This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch vector-index-dev in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push: new d6048f27a55 [fix] fix unstable test case (#51159) d6048f27a55 is described below commit d6048f27a556b3b5a2a164a679fed2bc7ee4536f Author: zhiqiang <hezhiqi...@selectdb.com> AuthorDate: Thu May 22 17:44:35 2025 +0800 [fix] fix unstable test case (#51159) Search result of diff faiss index object is not same if we use batch insert mode. So i refactored the test case to make sure we can compare result of native faiss and vector index of doris --- be/src/olap/rowset/segment_v2/ann_index_reader.cpp | 4 +- be/src/vec/exprs/vann_topn_predicate.cpp | 5 +- .../olap/vector_search/ann_index_reader_test.cpp | 2 +- .../olap/vector_search/ann_index_smoke_test.cpp | 2 +- .../olap/vector_search/ann_range_search_test.cpp | 14 +- .../vector_search/ann_topn_descriptor_test.cpp | 42 +- .../olap/vector_search/faiss_vector_index_test.cpp | 540 +++++++++------------ be/test/olap/vector_search/native_faiss_test.cpp | 53 +- be/test/olap/vector_search/vector_search_utils.cpp | 249 ++++++++++ be/test/olap/vector_search/vector_search_utils.h | 74 ++- 10 files changed, 653 insertions(+), 332 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/ann_index_reader.cpp b/be/src/olap/rowset/segment_v2/ann_index_reader.cpp index b1fcdee341f..0222597ea32 100644 --- a/be/src/olap/rowset/segment_v2/ann_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/ann_index_reader.cpp @@ -39,9 +39,7 @@ void AnnIndexReader::update_result(const IndexSearchResult& search_result, for (size_t i = 0; i < limit; ++i) { distance[i] = search_result.distances[i]; } - // Use the search result's row_ids to update the input row_id - // by calculating the union in-place - roaring &= *search_result.roaring; + roaring = *search_result.roaring; } AnnIndexReader::AnnIndexReader(const TabletIndex* index_meta, diff --git a/be/src/vec/exprs/vann_topn_predicate.cpp b/be/src/vec/exprs/vann_topn_predicate.cpp index 41098e43a07..d68086dcd3d 100644 --- a/be/src/vec/exprs/vann_topn_predicate.cpp +++ b/be/src/vec/exprs/vann_topn_predicate.cpp @@ -147,8 +147,9 @@ Status AnnTopNDescriptor::evaluate_vector_ann_search( result_column = ColumnFloat64::create(); ColumnFloat64* result_column_float = assert_cast<ColumnFloat64*>(result_column.get()); - result_column_float->resize(ann_query_params.distance->size()); - for (size_t i = 0; i < ann_query_params.distance->size(); ++i) { + size_t num_results = ann_query_params.distance->size(); + result_column_float->resize(num_results); + for (size_t i = 0; i < num_results; ++i) { result_column_float->get_data()[i] = (*ann_query_params.distance)[i]; } row_ids = std::move(ann_query_params.row_ids); diff --git a/be/test/olap/vector_search/ann_index_reader_test.cpp b/be/test/olap/vector_search/ann_index_reader_test.cpp index b384d0c6671..e951a48c743 100644 --- a/be/test/olap/vector_search/ann_index_reader_test.cpp +++ b/be/test/olap/vector_search/ann_index_reader_test.cpp @@ -26,7 +26,7 @@ namespace doris::segment_v2 { -using namespace vec_search_mock; +using namespace vector_search_utils; class AnnIndexReaderTest : public testing::Test {}; TEST_F(AnnIndexReaderTest, TestLoadIndex) { diff --git a/be/test/olap/vector_search/ann_index_smoke_test.cpp b/be/test/olap/vector_search/ann_index_smoke_test.cpp index 03d1d543a27..b106a9bc15e 100644 --- a/be/test/olap/vector_search/ann_index_smoke_test.cpp +++ b/be/test/olap/vector_search/ann_index_smoke_test.cpp @@ -31,7 +31,7 @@ #include "vector_index.h" #include "vector_search_utils.h" -using namespace doris::vec_search_mock; +using namespace doris::vector_search_utils; namespace doris { diff --git a/be/test/olap/vector_search/ann_range_search_test.cpp b/be/test/olap/vector_search/ann_range_search_test.cpp index 248ae3d4fd7..8b9ad847f46 100644 --- a/be/test/olap/vector_search/ann_range_search_test.cpp +++ b/be/test/olap/vector_search/ann_range_search_test.cpp @@ -886,14 +886,15 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch) { idx_to_cid[3] = 3; std::vector<std::unique_ptr<segment_v2::IndexIterator>> cid_to_index_iterators; cid_to_index_iterators.resize(4); - cid_to_index_iterators[1] = std::make_unique<doris::vec_search_mock::MockAnnIndexIterator>(); + cid_to_index_iterators[1] = + std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>(); std::vector<std::unique_ptr<segment_v2::ColumnIterator>> column_iterators; column_iterators.resize(4); column_iterators[3] = std::make_unique<doris::segment_v2::VirtualColumnIterator>(); roaring::Roaring row_bitmap; - doris::vec_search_mock::MockAnnIndexIterator* mock_ann_index_iter = - dynamic_cast<doris::vec_search_mock::MockAnnIndexIterator*>( + doris::vector_search_utils::MockAnnIndexIterator* mock_ann_index_iter = + dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>( cid_to_index_iterators[1].get()); // Explain: @@ -978,14 +979,15 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) { idx_to_cid[3] = 3; std::vector<std::unique_ptr<segment_v2::IndexIterator>> cid_to_index_iterators; cid_to_index_iterators.resize(4); - cid_to_index_iterators[1] = std::make_unique<doris::vec_search_mock::MockAnnIndexIterator>(); + cid_to_index_iterators[1] = + std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>(); std::vector<std::unique_ptr<segment_v2::ColumnIterator>> column_iterators; column_iterators.resize(4); column_iterators[3] = std::make_unique<doris::segment_v2::VirtualColumnIterator>(); roaring::Roaring row_bitmap; - doris::vec_search_mock::MockAnnIndexIterator* mock_ann_index_iter = - dynamic_cast<doris::vec_search_mock::MockAnnIndexIterator*>( + doris::vector_search_utils::MockAnnIndexIterator* mock_ann_index_iter = + dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>( cid_to_index_iterators[1].get()); // Explain: diff --git a/be/test/olap/vector_search/ann_topn_descriptor_test.cpp b/be/test/olap/vector_search/ann_topn_descriptor_test.cpp index b08adf73c32..7437bd80411 100644 --- a/be/test/olap/vector_search/ann_topn_descriptor_test.cpp +++ b/be/test/olap/vector_search/ann_topn_descriptor_test.cpp @@ -116,11 +116,40 @@ TEST_F(VectorSearchTest, AnnTopNDescriptorEvaluateTopN) { ASSERT_TRUE(st.ok()) << fmt::format("st: {}, expr {}", st.to_string(), predicate->get_order_by_expr_ctx()->root()->debug_string()); - std::shared_ptr<std::vector<float>> query_value = std::make_shared<std::vector<float>>(10, 0.0); + const ColumnConst* const_column = + assert_cast<const ColumnConst*>(predicate->_query_array.get()); + const ColumnArray* column_array = + assert_cast<const ColumnArray*>(const_column->get_data_column_ptr().get()); + const ColumnNullable* column_nullable = + assert_cast<const ColumnNullable*>(column_array->get_data_ptr().get()); + const ColumnFloat64* cf64 = + assert_cast<const ColumnFloat64*>(column_nullable->get_nested_column_ptr().get()); + + const double* query_value = cf64->get_data().data(); + const size_t query_value_size = cf64->get_data().size(); + ASSERT_EQ(query_value_size, 8); + std::vector<float> query_value_f32; + for (size_t i = 0; i < query_value_size; ++i) { + query_value_f32.push_back(static_cast<float>(query_value[i])); + } + ASSERT_FLOAT_EQ(query_value_f32[0], 1.0f) << "query_value_f32[0] = " << query_value_f32[0]; + ASSERT_FLOAT_EQ(query_value_f32[1], 2.0f) << "query_value_f32[1] = " << query_value_f32[1]; + ASSERT_FLOAT_EQ(query_value_f32[2], 3.0f) << "query_value_f32[2] = " << query_value_f32[2]; + ASSERT_FLOAT_EQ(query_value_f32[3], 4.0f) << "query_value_f32[3] = " << query_value_f32[3]; + ASSERT_FLOAT_EQ(query_value_f32[4], 5.0f) << "query_value_f32[4] = " << query_value_f32[4]; + ASSERT_FLOAT_EQ(query_value_f32[5], 6.0f) << "query_value_f32[5] = " << query_value_f32[5]; + ASSERT_FLOAT_EQ(query_value_f32[6], 7.0f) << "query_value_f32[6] = " << query_value_f32[6]; + ASSERT_FLOAT_EQ(query_value_f32[7], 20.0f) << "query_value_f32[7] = " << query_value_f32[7]; + + std::shared_ptr<std::vector<float>> query_vector = + std::make_shared<std::vector<float>>(10, 0.0); for (size_t i = 0; i < 10; ++i) { - (*query_value)[i] = static_cast<float>(i); + (*query_vector)[i] = static_cast<float>(i); } + std::cout << "query_vector: " << fmt::format("[{}]", fmt::join(*query_vector, ",")) + << std::endl; + EXPECT_CALL(*_ann_index_iterator, read_from_index(testing::_)) .Times(1) .WillOnce(testing::Invoke([](const segment_v2::IndexParam& value) { @@ -136,17 +165,16 @@ TEST_F(VectorSearchTest, AnnTopNDescriptorEvaluateTopN) { _result_column = ColumnFloat64::create(0, 0); std::unique_ptr<std::vector<uint64_t>> row_ids = std::make_unique<std::vector<uint64_t>>(); - // roaring is empry + roaring::Roaring roaring; st = predicate->evaluate_vector_ann_search(_ann_index_iterator.get(), roaring, _result_column, row_ids); ColumnFloat64* result_column_float = assert_cast<ColumnFloat64*>(_result_column.get()); - for (size_t i = 0; i < query_value->size(); ++i) { - EXPECT_EQ(result_column_float->get_data()[i], (*query_value)[i]); + for (size_t i = 0; i < query_vector->size(); ++i) { + EXPECT_EQ(result_column_float->get_data()[i], (*query_vector)[i]); } ASSERT_TRUE(st.ok()); - ASSERT_EQ(row_ids->size(), 0); - ASSERT_EQ(row_ids->size(), roaring.cardinality()); + ASSERT_EQ(row_ids->size(), 10); } } // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/olap/vector_search/faiss_vector_index_test.cpp b/be/test/olap/vector_search/faiss_vector_index_test.cpp index 7c7b42611ec..bae49627ef2 100644 --- a/be/test/olap/vector_search/faiss_vector_index_test.cpp +++ b/be/test/olap/vector_search/faiss_vector_index_test.cpp @@ -31,109 +31,10 @@ #include "vector_index.h" #include "vector_search_utils.h" -// Generate random vectors for testing -std::vector<float> generate_random_vector(int dim) { - std::vector<float> vector(dim); - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution<float> dis(-1.0f, 1.0f); - - for (int i = 0; i < dim; i++) { - vector[i] = dis(gen); - } - return vector; -} - using namespace doris::segment_v2; namespace doris::vectorized { -// Helper function to create and configure a Doris Faiss index -static std::unique_ptr<FaissVectorIndex> create_doris_index( - int dimension, int m, - FaissBuildParameter::IndexType index_type = FaissBuildParameter::IndexType::HNSW, - FaissBuildParameter::Quantilizer quantilizer = FaissBuildParameter::Quantilizer::FLAT) { - auto index = std::make_unique<FaissVectorIndex>(); - - FaissBuildParameter params; - params.d = dimension; - params.m = m; - params.index_type = index_type; - params.quantilizer = quantilizer; - index->set_build_params(params); - - return index; -} - -// Helper function to create a native Faiss HNSW index -static std::unique_ptr<faiss::IndexHNSWFlat> create_native_index(int dimension, int m) { - return std::make_unique<faiss::IndexHNSWFlat>(dimension, m); -} - -// Helper function to generate a batch of random vectors -static std::vector<std::vector<float>> generate_test_vectors(int num_vectors, int dimension) { - std::vector<std::vector<float>> vectors; - vectors.reserve(num_vectors); - - for (int i = 0; i < num_vectors; i++) { - vectors.push_back(generate_random_vector(dimension)); - } - - return vectors; -} - -// Helper function to add vectors to both Doris and native indexes -static void add_vectors_to_indexes(FaissVectorIndex* doris_index, - faiss::IndexHNSWFlat* native_index, - const std::vector<std::vector<float>>& vectors) { - for (size_t i = 0; i < vectors.size(); i++) { - auto status = doris_index->add(1, vectors[i].data()); - ASSERT_TRUE(status.ok()) << "Failed to add vector to Doris index: " << status.to_string(); - - native_index->add(1, vectors[i].data()); - } -} - -// Helper function to print search results for comparison -[[maybe_unused]] static void print_search_results(const IndexSearchResult& doris_results, - const std::vector<float>& native_distances, - const std::vector<faiss::idx_t>& native_indices, - int query_idx) { - std::cout << "Query vector index: " << query_idx << std::endl; - - std::cout << "Doris Index Results:" << std::endl; - for (int i = 0; i < doris_results.roaring->cardinality(); i++) { - std::cout << "ID: " << doris_results.roaring->getIndex(i) - << ", Distance: " << doris_results.distances[i] << std::endl; - } - - std::cout << "Native Faiss Results:" << std::endl; - for (size_t i = 0; i < native_indices.size(); i++) { - if (native_indices[i] == -1) continue; - std::cout << "ID: " << native_indices[i] << ", Distance: " << native_distances[i] - << std::endl; - } -} - -// Helper function to compare search results between Doris and native Faiss -static void compare_search_results(const IndexSearchResult& doris_results, - const std::vector<float>& native_distances, - const std::vector<faiss::idx_t>& native_indices, - float epsilon = 1e-5) { - EXPECT_EQ(doris_results.roaring->cardinality(), - std::count_if(native_indices.begin(), native_indices.end(), - [](faiss::idx_t id) { return id != -1; })); - - for (size_t i = 0; i < native_indices.size(); i++) { - if (native_indices[i] == -1) continue; - - EXPECT_TRUE(doris_results.roaring->contains(native_indices[i])) - << "ID mismatch at rank " << i; - EXPECT_NEAR(doris_results.distances[i], native_distances[i], epsilon) - << "Distance mismatch at rank " << i; - } -} - // Test saving and loading an index TEST_F(VectorSearchTest, TestSaveAndLoad) { // Step 1: Create first index instance @@ -151,7 +52,7 @@ TEST_F(VectorSearchTest, TestSaveAndLoad) { const int num_vectors = 100; std::vector<float> vectors; for (int i = 0; i < num_vectors; i++) { - auto tmp = generate_random_vector(params.d); + auto tmp = vector_search_utils::generate_random_vector(params.d); vectors.insert(vectors.end(), tmp.begin(), tmp.end()); } @@ -169,7 +70,7 @@ TEST_F(VectorSearchTest, TestSaveAndLoad) { ASSERT_TRUE(load_status.ok()) << "Failed to load index: " << load_status.to_string(); // Step 7: Verify the loaded index works by searching - auto query_vec = generate_random_vector(params.d); + auto query_vec = vector_search_utils::generate_random_vector(params.d); const int top_k = 10; IndexSearchParameters search_params; @@ -221,132 +122,173 @@ TEST_F(VectorSearchTest, UpdateRoaring) { } TEST_F(VectorSearchTest, CompareResultWithNativeFaiss1) { - // Step 1: Create indexes - const int dimension = 64; - const int max_connections = 16; - auto doris_index = create_doris_index(dimension, max_connections); - auto native_index = create_native_index(dimension, max_connections); - - // Step 2: Generate vectors and add to indexes - const int num_vectors = 200; - auto vectors = generate_test_vectors(num_vectors, dimension); - add_vectors_to_indexes(doris_index.get(), native_index.get(), vectors); - - // Step 3: Search - int query_idx = num_vectors / 2; - const float* query_vec = vectors[query_idx].data(); - const int top_k = 10; - - // Search in Doris index - IndexSearchParameters search_params; - IndexSearchResult doris_results; - auto search_status = - doris_index->ann_topn_search(query_vec, top_k, search_params, doris_results); - ASSERT_EQ(search_status.ok(), true); - - // Search in native Faiss index - std::vector<float> native_distances(top_k); - std::vector<faiss::idx_t> native_indices(top_k); - native_index->search(1, query_vec, top_k, native_distances.data(), native_indices.data()); - - // Step 4: Print and compare results - // print_search_results(doris_results, native_distances, native_indices, query_idx); - compare_search_results(doris_results, native_distances, native_indices); + const size_t iterations = 50; + // Create random number generator + std::random_device rd; + std::mt19937 gen(rd()); + // Define fixed parameter sets to choose from + const std::vector<int> dimensions = {32, 64, 128, 256}; + const std::vector<int> max_connections = {8, 16, 32, 64}; + const std::vector<int> vector_counts = {100, 200, 500, 1000}; + + for (size_t iter = 0; iter < iterations; ++iter) { + // Randomly select parameters from the fixed sets + const int dimension = + dimensions[std::uniform_int_distribution<>(0, dimensions.size() - 1)(gen)]; + const int max_connection = max_connections[std::uniform_int_distribution<>( + 0, max_connections.size() - 1)(gen)]; + const int num_vectors = + vector_counts[std::uniform_int_distribution<>(0, vector_counts.size() - 1)(gen)]; + + // Step 1: Create indexes + auto doris_index = doris::vector_search_utils::create_doris_index( + doris::vector_search_utils::IndexType::HNSW, dimension, max_connection); + auto native_index = doris::vector_search_utils::create_native_index( + doris::vector_search_utils::IndexType::HNSW, dimension, max_connection); + + // Step 2: Generate vectors and add to indexes + auto vectors = + doris::vector_search_utils::generate_test_vectors_matrix(num_vectors, dimension); + doris::vector_search_utils::add_vectors_to_indexes_serial_mode(doris_index.get(), + native_index.get(), vectors); + + // Step 3: Search + int query_idx = num_vectors / 2; + const float* query_vec = vectors[query_idx].data(); + const int top_k = 10; + + // Search in Doris index + IndexSearchParameters search_params; + IndexSearchResult doris_results; + auto search_status = + doris_index->ann_topn_search(query_vec, top_k, search_params, doris_results); + ASSERT_EQ(search_status.ok(), true) + << "Search failed with dimension=" << dimension + << ", max_connections=" << max_connection << ", num_vectors=" << num_vectors; + + // Search in native Faiss index + std::vector<float> native_distances(top_k); + std::vector<faiss::idx_t> native_indices(top_k); + native_index->search(1, query_vec, top_k, native_distances.data(), native_indices.data()); + + // Step 4: Compare results + vector_search_utils::compare_search_results(doris_results, native_distances, + native_indices); + } } TEST_F(VectorSearchTest, CompareResultWithNativeFaiss2) { - // Step 1: Create indexes - const int dimension = 64; - const int max_connections = 16; - auto doris_index = create_doris_index(dimension, max_connections); - auto native_index = create_native_index(dimension, max_connections); - - // Step 2: Generate vectors and add to indexes - const int num_vectors = 500; - auto vectors = generate_test_vectors(num_vectors, dimension); - add_vectors_to_indexes(doris_index.get(), native_index.get(), vectors); - - // Step 3: Search - int query_idx = num_vectors / 2; - const float* query_vec = vectors[query_idx].data(); - const int top_k = num_vectors; - - // Search in Doris index - IndexSearchParameters search_params; - IndexSearchResult doris_results; - auto search_status = - doris_index->ann_topn_search(query_vec, top_k, search_params, doris_results); - ASSERT_EQ(search_status.ok(), true); - - // Search in native Faiss index - std::vector<float> native_distances(top_k, -1); - std::vector<faiss::idx_t> native_indices(top_k, -1); - native_index->search(1, query_vec, top_k, native_distances.data(), native_indices.data()); - - // Step 4: Print and compare results - // print_search_results(doris_results, native_distances, native_indices, query_idx); - compare_search_results(doris_results, native_distances, native_indices); + const size_t iterations = 50; + // Create random number generator + std::random_device rd; + std::mt19937 gen(rd()); + // Define fixed parameter sets to choose from + const std::vector<int> dimensions = {32, 64, 128, 256}; + const std::vector<int> max_connections = {8, 16, 32, 64}; + const std::vector<int> vector_counts = {100, 200, 500, 1000}; + + for (size_t i = 0; i < iterations; ++i) { + // Randomly select parameters from the fixed sets + const int dimension = + dimensions[std::uniform_int_distribution<>(0, dimensions.size() - 1)(gen)]; + const int max_connection = max_connections[std::uniform_int_distribution<>( + 0, max_connections.size() - 1)(gen)]; + const int num_vectors = + vector_counts[std::uniform_int_distribution<>(0, vector_counts.size() - 1)(gen)]; + + // Step 1: Create indexes + auto doris_index = doris::vector_search_utils::create_doris_index( + doris::vector_search_utils::IndexType::HNSW, dimension, max_connection); + auto native_index = doris::vector_search_utils::create_native_index( + doris::vector_search_utils::IndexType::HNSW, dimension, max_connection); + + // Step 2: Generate vectors and add to indexes + std::vector<std::vector<float>> vectors = + doris::vector_search_utils::generate_test_vectors_matrix(num_vectors, dimension); + doris::vector_search_utils::add_vectors_to_indexes_serial_mode(doris_index.get(), + native_index.get(), vectors); + + // Step 3: Search + int query_idx = num_vectors / 2; + const float* query_vec = vectors[query_idx].data(); + const int top_k = num_vectors; + IndexSearchParameters search_params; + IndexSearchResult doris_results; + std::ignore = doris_index->ann_topn_search(query_vec, top_k, search_params, doris_results); + + // Search in native Faiss index + std::vector<float> native_distances(top_k, -1); + std::vector<faiss::idx_t> native_indices(top_k, -1); + native_index->search(1, query_vec, top_k, native_distances.data(), native_indices.data()); + + // Step 4: Compare results + doris::vector_search_utils::compare_search_results(doris_results, native_distances, + native_indices); + } } TEST_F(VectorSearchTest, SearchAllVectors) { - // Step 1: Create and build index - auto index1 = std::make_unique<FaissVectorIndex>(); + size_t iterations = 25; + for (size_t i = 0; i < iterations; ++i) { + // Step 1: Create and build index + auto index1 = std::make_unique<FaissVectorIndex>(); - FaissBuildParameter params; - params.d = 64; - params.m = 32; - params.index_type = FaissBuildParameter::IndexType::HNSW; - params.quantilizer = FaissBuildParameter::Quantilizer::FLAT; - index1->set_build_params(params); + FaissBuildParameter params; + params.d = 64; + params.m = 32; + params.index_type = FaissBuildParameter::IndexType::HNSW; + params.quantilizer = FaissBuildParameter::Quantilizer::FLAT; + index1->set_build_params(params); - // Add 500 vectors - const int num_vectors = 500; - std::vector<float> vectors; - for (int i = 0; i < num_vectors; i++) { - auto vec = generate_random_vector(params.d); - vectors.insert(vectors.end(), vec.begin(), vec.end()); - } + // Add 500 vectors + const int num_vectors = 500; + std::vector<float> vectors; + for (int i = 0; i < num_vectors; i++) { + auto vec = doris::vector_search_utils::generate_random_vector(params.d); + vectors.insert(vectors.end(), vec.begin(), vec.end()); + } - ASSERT_EQ(index1->add(500, vectors.data()).ok(), true); + ASSERT_EQ(index1->add(500, vectors.data()).ok(), true); - // Save index - ASSERT_TRUE(index1->save(_ram_dir.get()).ok()); + // Save index + ASSERT_TRUE(index1->save(_ram_dir.get()).ok()); - // Step 2: Load index - auto index2 = std::make_unique<FaissVectorIndex>(); - ASSERT_TRUE(index2->load(_ram_dir.get()).ok()); + // Step 2: Load index + auto index2 = std::make_unique<FaissVectorIndex>(); + ASSERT_TRUE(index2->load(_ram_dir.get()).ok()); - // Step 3: Search all vectors - IndexSearchParameters search_params; - IndexSearchResult search_result; - - // Search for all vectors - use a vector we know is in the index - std::vector<float> query_vec {vectors.begin(), - vectors.begin() + params.d}; // Use the first vector we added - const int top_k = num_vectors; // Get all vectors - - ASSERT_EQ(index2->ann_topn_search(query_vec.data(), top_k, search_params, search_result).ok(), - true); - std::cout << "Search returned " << search_result.roaring->cardinality() << " results\n"; - // Step 4: Verify we got all vectors back - // Note: In practical ANN search with approximate algorithms like HNSW, - // we might not get exactly all vectors due to the nature of approximate search. - // So we verify we got a reasonable number back. - EXPECT_GE(search_result.roaring->cardinality(), num_vectors * 0.60) - << "Expected to find at least 60% of all vectors"; - - // Also verify the first result is the query vector itself (it should be an exact match) - ASSERT_EQ(search_result.roaring->isEmpty(), false) << "Search result should not be empty"; - size_t first = search_result.roaring->getIndex(0); - std::vector<float> first_result_vec(vectors.begin() + first * params.d, - vectors.begin() + (first + 1) * params.d); - std::string query_vec_str = fmt::format("[{}]", fmt::join(query_vec, ",")); - std::string first_result_vec_str = fmt::format("[{}]", fmt::join(first_result_vec, ",")); - EXPECT_EQ(first_result_vec, query_vec) << "First result should be the query vector itself"; + // Step 3: Search all vectors + IndexSearchParameters search_params; + IndexSearchResult search_result; + + // Search for all vectors - use a vector we know is in the index + std::vector<float> query_vec {vectors.begin(), + vectors.begin() + params.d}; // Use the first vector we added + const int top_k = num_vectors; // Get all vectors + + ASSERT_EQ( + index2->ann_topn_search(query_vec.data(), top_k, search_params, search_result).ok(), + true); + // Step 4: Verify we got all vectors back + // Note: In practical ANN search with approximate algorithms like HNSW, + // we might not get exactly all vectors due to the nature of approximate search. + // So we verify we got a reasonable number back. + EXPECT_GE(search_result.roaring->cardinality(), num_vectors * 0.60) + << "Expected to find at least 60% of all vectors"; + + // Also verify the first result is the query vector itself (it should be an exact match) + ASSERT_EQ(search_result.roaring->isEmpty(), false) << "Search result should not be empty"; + size_t first = search_result.roaring->getIndex(0); + std::vector<float> first_result_vec(vectors.begin() + first * params.d, + vectors.begin() + (first + 1) * params.d); + std::string query_vec_str = fmt::format("[{}]", fmt::join(query_vec, ",")); + std::string first_result_vec_str = fmt::format("[{}]", fmt::join(first_result_vec, ",")); + EXPECT_EQ(first_result_vec, query_vec) << "First result should be the query vector itself"; + } } TEST_F(VectorSearchTest, CompRangeSearch) { - size_t iterations = 50; + size_t iterations = 25; for (size_t i = 0; i < iterations; ++i) { // Random parameters for each test iteration std::random_device rd; @@ -367,35 +309,36 @@ TEST_F(VectorSearchTest, CompRangeSearch) { index1->set_build_params(params); const int num_vectors = random_n; - std::vector<float> vectors; + std::vector<std::vector<float>> vectors; for (int i = 0; i < num_vectors; i++) { - auto vec = generate_random_vector(params.d); - vectors.insert(vectors.end(), vec.begin(), vec.end()); + auto vec = vector_search_utils::generate_random_vector(params.d); + vectors.push_back(vec); } std::unique_ptr<faiss::Index> native_index = std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m); - native_index->add(num_vectors, vectors.data()); - std::ignore = index1->add(num_vectors, vectors.data()); + doris::vector_search_utils::add_vectors_to_indexes_serial_mode(index1.get(), + native_index.get(), vectors); - std::vector<float> query_vec {vectors.begin(), - vectors.begin() + params.d}; // Use the first vector we added + std::vector<float> query_vec = vectors.front(); std::vector<std::pair<size_t, float>> distances(num_vectors); for (int i = 0; i < num_vectors; i++) { double sum = 0; + auto& vec = vectors[i]; for (int j = 0; j < params.d; j++) { - accumulate(vectors[i * params.d + j], query_vec[j], sum); + accumulate(vec[j], query_vec[j], sum); } distances[i] = std::make_pair(i, finalize(sum)); } std::sort(distances.begin(), distances.end(), [](const auto& a, const auto& b) { return a.second < b.second; }); - // Use the median distance as the radius - float radius = distances[num_vectors / 2].second; + + float radius = distances[num_vectors / 4].second; HNSWSearchParameters hnsw_params; hnsw_params.ef_search = 16; // Set efSearch for better accuracy hnsw_params.roaring = nullptr; // No selector for this test + hnsw_params.is_le_or_lt = true; IndexSearchResult doris_result; std::ignore = index1->range_search(query_vec.data(), radius, hnsw_params, doris_result); @@ -452,30 +395,31 @@ TEST_F(VectorSearchTest, RangeSearchNoSelector1) { index1->set_build_params(params); const int num_vectors = random_n; - std::vector<float> vectors; + std::vector<std::vector<float>> vectors; for (int i = 0; i < num_vectors; i++) { - auto vec = generate_random_vector(params.d); - vectors.insert(vectors.end(), vec.begin(), vec.end()); + auto vec = vector_search_utils::generate_random_vector(params.d); + vectors.push_back(vec); } + std::unique_ptr<faiss::Index> native_index = + std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m); + doris::vector_search_utils::add_vectors_to_indexes_serial_mode(index1.get(), + native_index.get(), vectors); + + std::vector<float> query_vec = vectors.front(); std::vector<std::pair<size_t, float>> distances(num_vectors); for (int i = 0; i < num_vectors; i++) { double sum = 0; + auto& vec = vectors[i]; for (int j = 0; j < params.d; j++) { - accumulate(vectors[i * params.d + j], vectors[j], sum); + accumulate(vec[j], query_vec[j], sum); } distances[i] = std::make_pair(i, finalize(sum)); } std::sort(distances.begin(), distances.end(), [](const auto& a, const auto& b) { return a.second < b.second; }); - // Use the median distance as the radius - float radius = distances[num_vectors / 2].second; - - std::unique_ptr<faiss::Index> native_index = - std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m, faiss::METRIC_L2); - native_index->add(num_vectors, vectors.data()); - std::ignore = index1->add(num_vectors, vectors.data()); + float radius = distances[num_vectors / 4].second; // Save index ASSERT_TRUE(index1->save(_ram_dir.get()).ok()); @@ -485,8 +429,6 @@ TEST_F(VectorSearchTest, RangeSearchNoSelector1) { // Step 3: Range search // Use a vector we know is in the index - std::vector<float> query_vec {vectors.begin(), - vectors.begin() + params.d}; // Use the first vector we added faiss::SearchParametersHNSW search_params; search_params.efSearch = 16; // Set efSearch for better accuracy @@ -568,17 +510,20 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { index1->set_build_params(params); const int num_vectors = 1000; - std::vector<float> vectors; + std::vector<std::vector<float>> vectors; for (int i = 0; i < num_vectors; i++) { - auto vec = generate_random_vector(params.d); - vectors.insert(vectors.end(), vec.begin(), vec.end()); + auto vec = vector_search_utils::generate_random_vector(params.d); + vectors.push_back(vec); } + // Use a vector we know is in the index + std::vector<float> query_vec = vectors.front(); std::vector<std::pair<size_t, float>> distances(num_vectors); for (int i = 0; i < num_vectors; i++) { double sum = 0; + auto& vec = vectors[i]; for (int j = 0; j < params.d; j++) { - accumulate(vectors[i * params.d + j], vectors[j], sum); + accumulate(vec[j], query_vec[j], sum); } distances[i] = std::make_pair(i, finalize(sum)); } @@ -589,8 +534,8 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { std::unique_ptr<faiss::Index> native_index = std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m, faiss::METRIC_L2); - native_index->add(num_vectors, vectors.data()); - std::ignore = index1->add(num_vectors, vectors.data()); + doris::vector_search_utils::add_vectors_to_indexes_serial_mode(index1.get(), + native_index.get(), vectors); std::unique_ptr<roaring::Roaring> all_rows = std::make_unique<roaring::Roaring>(); std::unique_ptr<roaring::Roaring> sel_rows = std::make_unique<roaring::Roaring>(); @@ -600,11 +545,8 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { sel_rows->add(i); } } - // Step 3: Range search - // Use a vector we know is in the index - std::vector<float> query_vec {vectors.begin(), - vectors.begin() + params.d}; // Use the first vector we added + // Step 3: Range search faiss::SearchParametersHNSW search_params; search_params.efSearch = 16; // Set efSearch for better accuracy auto faiss_selector = segment_v2::FaissVectorIndex::roaring_to_faiss_selector(*sel_rows); @@ -612,7 +554,7 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { faiss::RangeSearchResult native_search_result(1, true); native_index->range_search(1, query_vec.data(), radius * radius, &native_search_result, &search_params); - + // labels and distance std::vector<std::pair<int, float>> native_results; size_t begin = native_search_result.lims[0]; size_t end = native_search_result.lims[1]; @@ -622,7 +564,7 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { } HNSWSearchParameters doris_search_params; - doris_search_params.ef_search = 16; // Set efSearch for better accuracy + doris_search_params.ef_search = search_params.efSearch; doris_search_params.is_le_or_lt = true; doris_search_params.roaring = sel_rows.get(); IndexSearchResult doris_search_result; @@ -644,11 +586,17 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { } doris_search_params.is_le_or_lt = false; - roaring::Roaring ge_rows = *doris_search_params.roaring; + IndexSearchResult doris_search_result2; + ASSERT_EQ(index1->range_search(query_vec.data(), radius, doris_search_params, + doris_search_result2) + .ok(), + true); + roaring::Roaring ge_rows = *doris_search_result2.roaring; roaring::Roaring less_rows; for (size_t i = 0; i < native_results.size(); ++i) { less_rows.add(native_results[i].first); } + // result2 contains all rows that not included by result1 roaring::Roaring and_row_id = ge_rows & less_rows; roaring::Roaring or_row_id = ge_rows | less_rows; ASSERT_NEAR(and_row_id.cardinality(), 0, 1); @@ -659,14 +607,12 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) { TEST_F(VectorSearchTest, RangeSearchEmptyResult) { for (size_t i = 0; i < 10; ++i) { - auto index1 = std::make_unique<FaissVectorIndex>(); - FaissBuildParameter params; - params.d = 10; - params.m = 32; - params.index_type = FaissBuildParameter::IndexType::HNSW; - index1->set_build_params(params); - + const size_t d = 10; + const size_t m = 32; const int num_vectors = 1000; + auto index1 = + vector_search_utils::create_doris_index(vector_search_utils::IndexType::HNSW, d, m); + std::vector<float> vectors; // Create 1000 vectors and make sure their l2_distance with [1,2,3,4,5,6,7,8,9,10] is less than 100 // [1,2,3,4,5,6,7,8,9,10] @@ -679,73 +625,66 @@ TEST_F(VectorSearchTest, RangeSearchEmptyResult) { rowid = rowid % 10; } - std::vector<float> vec(params.d); - for (int colid = 0; colid < params.d; colid++) { + std::vector<float> vec(d); + for (int colid = 0; colid < d; colid++) { vec[colid] = (rowid + colid) % 10 + 1; } vectors.insert(vectors.end(), vec.begin(), vec.end()); } - // Find the min - float radius = 5.0f; - - std::unique_ptr<faiss::Index> native_index = - std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m, faiss::METRIC_L2); - - native_index->add(num_vectors, vectors.data()); - std::ignore = index1->add(num_vectors, vectors.data()); + std::unique_ptr<faiss::Index> native_index = vector_search_utils::create_native_index( + vector_search_utils::IndexType::HNSW, d, m); + vector_search_utils::add_vectors_to_indexes_batch_mode(index1.get(), native_index.get(), + num_vectors, vectors); std::vector<float> query_vec; - for (int i = 0; i < params.d; i++) { + for (int i = 0; i < d; i++) { query_vec.push_back(5.0f); } - // L2 distance between [5,5,5,5,5,5,5,5,5,5] with any vector add added is large than 5 and less than 250. - faiss::SearchParametersHNSW search_params; - search_params.efSearch = 1000; // Set efSearch for better accuracy - faiss::RangeSearchResult native_search_result(1, true); - native_index->range_search(1, query_vec.data(), radius * radius, &native_search_result); + // L2 distance between [5,5,5,5,5,5,5,5,5,5] with any other vector is large than 5 and less than 250. + // Find the min + float radius = 5.0f; + doris::segment_v2::HNSWSearchParameters search_params; + search_params.ef_search = 1000; // Set efSearch for better accuracy + auto doris_search_result = vector_search_utils::perform_doris_index_range_search( + index1.get(), query_vec.data(), radius, search_params); + auto native_search_result = vector_search_utils::perform_native_index_range_search( + native_index.get(), query_vec.data(), radius); - std::vector<std::pair<int, float>> native_results; - size_t begin = native_search_result.lims[0]; - size_t end = native_search_result.lims[1]; - for (size_t i = begin; i < end; i++) { - native_results.push_back( - {native_search_result.labels[i], native_search_result.distances[i]}); - } + ASSERT_EQ(doris_search_result->roaring->cardinality(), 0); + ASSERT_EQ(native_search_result.size(), 0); - HNSWSearchParameters doris_search_params; - doris_search_params.ef_search = 1000; // Set efSearch for better accuracy - doris_search_params.is_le_or_lt = true; + // Search all rows. + doris::segment_v2::HNSWSearchParameters search_params_all_rows; + search_params_all_rows.ef_search = 1000; // Set efSearch for better accuracy + search_params_all_rows.is_le_or_lt = true; std::unique_ptr<roaring::Roaring> sel_rows = std::make_unique<roaring::Roaring>(); for (size_t i = 0; i < num_vectors; ++i) { sel_rows->add(i); } + search_params_all_rows.roaring = sel_rows.get(); + doris_search_result = vector_search_utils::perform_doris_index_range_search( + index1.get(), query_vec.data(), radius, search_params_all_rows); + ASSERT_EQ(doris_search_result->distances != nullptr, true); - // Search all rows. - doris_search_params.roaring = sel_rows.get(); - IndexSearchResult doris_search_result; - - ASSERT_EQ(index1->range_search(query_vec.data(), radius, doris_search_params, - doris_search_result) - .ok(), - true); + native_search_result = vector_search_utils::perform_native_index_range_search( + native_index.get(), query_vec.data(), radius); - ASSERT_EQ(native_results.size(), doris_search_result.roaring->cardinality()); - ASSERT_EQ(doris_search_result.distances != nullptr, true); - for (size_t i = 0; i < native_results.size(); i++) { - const size_t rowid = native_results[i].first; - const float dis = native_results[i].second; - ASSERT_EQ(doris_search_result.roaring->contains(rowid), true) + // Make sure result is same + for (size_t i = 0; i < native_search_result.size(); i++) { + const size_t rowid = native_search_result[i].first; + const float dis = native_search_result[i].second; + ASSERT_EQ(doris_search_result->roaring->contains(rowid), true) << "Row ID mismatch at rank " << i; - ASSERT_FLOAT_EQ(doris_search_result.distances[i], sqrt(dis)) + ASSERT_FLOAT_EQ(doris_search_result->distances[i], sqrt(dis)) << "Distance mismatch at rank " << i; } - doris_search_params.is_le_or_lt = false; - roaring::Roaring ge_rows = *doris_search_params.roaring; + search_params_all_rows.is_le_or_lt = false; + roaring::Roaring ge_rows = *search_params_all_rows.roaring; roaring::Roaring less_rows; - for (size_t i = 0; i < native_results.size(); ++i) { - less_rows.add(native_results[i].first); + for (size_t i = 0; i < native_search_result.size(); ++i) { + less_rows.add(native_search_result[i].first); } roaring::Roaring and_row_id = ge_rows & less_rows; roaring::Roaring or_row_id = ge_rows | less_rows; @@ -762,4 +701,5 @@ TEST_F(VectorSearchTest, TestIdSelectorWithEmptyRoaring) { ASSERT_EQ(sel->is_member(i), false) << "Selector should be empty"; } } + } // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/olap/vector_search/native_faiss_test.cpp b/be/test/olap/vector_search/native_faiss_test.cpp index dc7b5da3046..1cfdd12019f 100644 --- a/be/test/olap/vector_search/native_faiss_test.cpp +++ b/be/test/olap/vector_search/native_faiss_test.cpp @@ -25,9 +25,11 @@ #include <iomanip> #include <roaring/roaring.hh> -#include "vector/faiss_vector_index.h" +#include "olap/vector_search/vector_search_utils.h" +namespace doris::vector_search_utils { std::vector<float> generate_random_vector(int dim); +} class NativeFaissTest : public ::testing::Test { public: @@ -233,7 +235,7 @@ TEST_F(NativeFaissTest, TestRangeSearch10000Random) { int n = 1000; std::vector<float> data_vectors; for (int i = 0; i < n; ++i) { - auto random_vector = generate_random_vector(dim); + auto random_vector = doris::vector_search_utils::generate_random_vector(dim); data_vectors.insert(data_vectors.end(), random_vector.begin(), random_vector.end()); } @@ -287,7 +289,7 @@ TEST_F(NativeFaissTest, TestRangeSearchRandomVectorsSearchMedian) { // Generate random vectors std::vector<float> data_vectors; for (int i = 0; i < n; ++i) { - auto random_vector = generate_random_vector(dim); + auto random_vector = doris::vector_search_utils::generate_random_vector(dim); data_vectors.insert(data_vectors.end(), random_vector.begin(), random_vector.end()); } @@ -367,13 +369,15 @@ TEST_F(NativeFaissTest, TestTopNRandomVectorSearchMedian) { // Generate random vectors std::vector<float> data_vectors; for (int i = 0; i < n; ++i) { - auto random_vector = generate_random_vector(dim); + auto random_vector = + doris::vector_search_utils::generate_random_vector(dim); data_vectors.insert(data_vectors.end(), random_vector.begin(), random_vector.end()); } // Use a random vector as query - std::vector<float> query_vector = generate_random_vector(dim); + std::vector<float> query_vector = + doris::vector_search_utils::generate_random_vector(dim); // Brute force search to find ground truth top-k std::vector<std::pair<int, float>> all_distances(n); @@ -436,7 +440,7 @@ TEST_F(NativeFaissTest, SameTypeIndexDiffObject) { int n = 1000; std::vector<float> data_vectors; for (int i = 0; i < n; ++i) { - auto random_vector = generate_random_vector(dim); + auto random_vector = doris::vector_search_utils::generate_random_vector(dim); data_vectors.insert(data_vectors.end(), random_vector.begin(), random_vector.end()); } @@ -475,4 +479,41 @@ TEST_F(NativeFaissTest, SameTypeIndexDiffObject) { ASSERT_FLOAT_EQ(actual_results1[i].second, actual_results2[i].second); ASSERT_FLOAT_EQ(actual_results1[i].second, actual_results3[i].second); } +} + +TEST_F(NativeFaissTest, BatchInsert) { + size_t iterations = 5; + for (size_t i = 0; i < iterations; ++i) { + const size_t d = 100; + const size_t m = 32; + const int num_vectors = 1000; + + auto index1 = doris::vector_search_utils::create_native_index( + doris::vector_search_utils::IndexType::HNSW, d, m); + + auto index2 = doris::vector_search_utils::create_native_index( + doris::vector_search_utils::IndexType::HNSW, d, m); + + auto flatten_vectors = + doris::vector_search_utils::generate_test_vectors_flatten(num_vectors, d); + + doris::vector_search_utils::add_vectors_to_indexes_batch_mode(nullptr, index1.get(), + num_vectors, flatten_vectors); + doris::vector_search_utils::add_vectors_to_indexes_batch_mode(nullptr, index2.get(), + num_vectors, flatten_vectors); + + // Use a random vector as query + std::vector<float> query_vector = doris::vector_search_utils::generate_random_vector(d); + float radius = doris::vector_search_utils::get_radius_from_flatten(query_vector.data(), d, + flatten_vectors, 0.4); + + // Get index-based results using HNSW index + auto res1 = doris::vector_search_utils::perform_native_index_range_search( + index1.get(), query_vector.data(), radius); + auto res2 = doris::vector_search_utils::perform_native_index_range_search( + index2.get(), query_vector.data(), radius); + // Insert vector using batch insert will make result is different. + std::cout << "res1 size: " << res1.size() << std::endl; + std::cout << "res2 size: " << res2.size() << std::endl; + } } \ No newline at end of file diff --git a/be/test/olap/vector_search/vector_search_utils.cpp b/be/test/olap/vector_search/vector_search_utils.cpp new file mode 100644 index 00000000000..6ee03666bbf --- /dev/null +++ b/be/test/olap/vector_search/vector_search_utils.cpp @@ -0,0 +1,249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vector_search_utils.h" + +#include <faiss/IndexHNSW.h> + +#include <cstddef> +#include <memory> + +#include "faiss_vector_index.h" +#include "vector_index.h" + +namespace doris::vector_search_utils { +static void accumulate(double x, double y, double& sum) { + sum += (x - y) * (x - y); +} +static double finalize(double sum) { + return sqrt(sum); +} + +// Generate random vectors for testing +std::vector<float> generate_random_vector(int dim) { + std::vector<float> vector(dim); + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution<float> dis(-1.0f, 1.0f); + + for (int i = 0; i < dim; i++) { + vector[i] = dis(gen); + } + return vector; +} + +// Helper function to create and configure a Doris Vector index +std::unique_ptr<doris::segment_v2::VectorIndex> create_doris_index(IndexType index_type, + int dimension, int m) { + auto index = std::make_unique<doris::segment_v2::FaissVectorIndex>(); + segment_v2::FaissBuildParameter params; + params.d = dimension; + params.m = m; + switch (index_type) { + case IndexType::FLAT_L2: + params.index_type = segment_v2::FaissBuildParameter::IndexType::BruteForce; + break; + case IndexType::HNSW: + params.index_type = segment_v2::FaissBuildParameter::IndexType::HNSW; + break; + default: + throw std::invalid_argument("Unsupported index type"); + } + params.quantilizer = segment_v2::FaissBuildParameter::Quantilizer::FLAT; + index->set_build_params(params); + return std::move(index); +} + +// Helper function to create a native Faiss index +std::unique_ptr<faiss::Index> create_native_index(IndexType type, int dimension, int m) { + std::unique_ptr<faiss::Index> index; + + switch (type) { + case IndexType::FLAT_L2: + index = std::make_unique<faiss::IndexFlatL2>(dimension); + break; + case IndexType::HNSW: + index = std::make_unique<faiss::IndexHNSWFlat>(dimension, m, faiss::METRIC_L2); + break; + default: + throw std::invalid_argument("Unsupported index type"); + } + + return index; +} + +// Helper function to generate a batch of random vectors +std::vector<std::vector<float>> generate_test_vectors_matrix(int num_vectors, int dimension) { + std::vector<std::vector<float>> vectors; + vectors.reserve(num_vectors); + + for (int i = 0; i < num_vectors; i++) { + vectors.push_back(generate_random_vector(dimension)); + } + + return vectors; +} + +std::vector<float> generate_test_vectors_flatten(int num_vectors, int dimension) { + std::vector<float> vectors; + vectors.reserve(num_vectors * dimension); + + for (int i = 0; i < num_vectors; i++) { + auto tmp = generate_random_vector(dimension); + vectors.insert(vectors.end(), tmp.begin(), tmp.end()); + } + + return vectors; +} + +// Helper function to add vectors to both Doris and native indexes +void add_vectors_to_indexes_serial_mode(segment_v2::VectorIndex* doris_index, + faiss::Index* native_index, + const std::vector<std::vector<float>>& vectors) { + for (size_t i = 0; i < vectors.size(); i++) { + if (doris_index) { + auto status = doris_index->add(1, vectors[i].data()); + ASSERT_TRUE(status.ok()) + << "Failed to add vector to Doris index: " << status.to_string(); + } + if (native_index) { + // Add vector to native Faiss index + native_index->add(1, vectors[i].data()); + } + } +} + +void add_vectors_to_indexes_batch_mode(segment_v2::VectorIndex* doris_index, + faiss::Index* native_index, size_t num_vectors, + const std::vector<float>& flatten_vectors) { + if (doris_index) { + auto status = doris_index->add(num_vectors, flatten_vectors.data()); + ASSERT_TRUE(status.ok()) << "Failed to add vectors to Doris index: " << status.to_string(); + } + + if (native_index) { + // Add vectors to native Faiss index + native_index->add(num_vectors, flatten_vectors.data()); + } +} + +// Helper function to print search results for comparison +void print_search_results(const segment_v2::IndexSearchResult& doris_results, + const std::vector<float>& native_distances, + const std::vector<faiss::idx_t>& native_indices, int query_idx) { + std::cout << "Query vector index: " << query_idx << std::endl; + + std::cout << "Doris Index Results:" << std::endl; + for (int i = 0; i < doris_results.roaring->cardinality(); i++) { + std::cout << "ID: " << doris_results.roaring->getIndex(i) + << ", Distance: " << doris_results.distances[i] << std::endl; + } + + std::cout << "Native Faiss Results:" << std::endl; + for (size_t i = 0; i < native_indices.size(); i++) { + if (native_indices[i] == -1) continue; + std::cout << "ID: " << native_indices[i] << ", Distance: " << native_distances[i] + << std::endl; + } +} + +// Helper function to compare search results between Doris and native Faiss +void compare_search_results(const segment_v2::IndexSearchResult& doris_results, + const std::vector<float>& native_distances, + const std::vector<faiss::idx_t>& native_indices, float abs_error) { + EXPECT_EQ(doris_results.roaring->cardinality(), + std::count_if(native_indices.begin(), native_indices.end(), + [](faiss::idx_t id) { return id != -1; })); + + for (size_t i = 0; i < native_indices.size(); i++) { + if (native_indices[i] == -1) continue; + + EXPECT_TRUE(doris_results.roaring->contains(native_indices[i])) + << "ID mismatch at rank " << i; + EXPECT_NEAR(doris_results.distances[i], native_distances[i], abs_error) + << "Distance mismatch at rank " << i; + } +} + +// result is a vector of pairs, where each pair contains the labels and distance +// result is sorted by labels +std::vector<std::pair<int, float>> perform_native_index_range_search(faiss::Index* index, + const float* query_vector, + float radius) { + std::vector<std::pair<int, float>> results; + faiss::RangeSearchResult result(1); + index->range_search(1, query_vector, radius * radius, &result); + size_t begin = result.lims[0]; + size_t end = result.lims[1]; + results.reserve(end - begin); + for (size_t j = begin; j < end; ++j) { + results.push_back({result.labels[j], sqrt(result.distances[j])}); + } + std::sort(results.begin(), results.end(), + [](const auto& a, const auto& b) { return a.first < b.first; }); + return results; +} + +std::unique_ptr<doris::segment_v2::IndexSearchResult> perform_doris_index_range_search( + segment_v2::VectorIndex* index, const float* query_vector, float radius, + const segment_v2::IndexSearchParameters& params) { + auto result = std::make_unique<doris::segment_v2::IndexSearchResult>(); + std::ignore = index->range_search(query_vector, radius, params, *result); + return result; +} + +float get_radius_from_flatten(const float* vector, int dim, + const std::vector<float>& flatten_vectors, float percentile) { + size_t n = flatten_vectors.size() / dim; + std::vector<std::pair<size_t, float>> distances(n); + for (int i = 0; i < n; i++) { + double sum = 0; + for (int j = 0; j < dim; j++) { + accumulate(flatten_vectors[i * dim + j], flatten_vectors[j], sum); + } + distances[i] = std::make_pair(i, finalize(sum)); + } + std::sort(distances.begin(), distances.end(), + [](const auto& a, const auto& b) { return a.second < b.second; }); + // Use the median distance as the radius + size_t percentile_index = static_cast<size_t>(n * percentile); + float radius = distances[percentile_index].second; + + return radius; +} + +float get_radius_from_matrix(const float* vector, int dim, + const std::vector<std::vector<float>>& matrix_vectors, + float percentile) { + size_t n = matrix_vectors.size(); + std::vector<std::pair<size_t, float>> distances(n); + for (size_t i = 0; i < n; i++) { + double sum = 0; + for (int j = 0; j < dim; j++) { + accumulate(matrix_vectors[i][j], vector[j], sum); + } + distances[i] = std::make_pair(i, finalize(sum)); + } + std::sort(distances.begin(), distances.end(), + [](const auto& a, const auto& b) { return a.second < b.second; }); + // Use the median distance as the radius + size_t percentile_index = static_cast<size_t>(n * percentile); + float radius = distances[percentile_index].second; + + return radius; +} +} // namespace doris::vector_search_utils \ No newline at end of file diff --git a/be/test/olap/vector_search/vector_search_utils.h b/be/test/olap/vector_search/vector_search_utils.h index b22e01b63b5..8fd79997819 100644 --- a/be/test/olap/vector_search/vector_search_utils.h +++ b/be/test/olap/vector_search/vector_search_utils.h @@ -20,6 +20,7 @@ #include <gen_cpp/Descriptors_types.h> #include <gen_cpp/Exprs_types.h> #include <gen_cpp/Types_types.h> +#include <gen_cpp/olap_file.pb.h> #include <gmock/gmock.h> #include <gtest/gtest.h> #include <stdint.h> @@ -36,12 +37,75 @@ #include "olap/tablet_schema.h" #include "runtime/descriptors.h" #include "vec/exprs/vexpr_context.h" +#include "vector_index.h" // Add CLucene RAM Directory header #include <CLucene/store/RAMDirectory.h> +#include <faiss/MetricType.h> using doris::segment_v2::DorisCompoundReader; -namespace doris::vec_search_mock { +namespace faiss { +class Index; +class IndexHNSWFlat; +} // namespace faiss + +namespace doris::segment_v2 { +class FaissVectorIndex; +} + +namespace doris::vector_search_utils { + +// Generate random vectors for testing +std::vector<float> generate_random_vector(int dim); +// Generate random vectors in matrix form +std::vector<std::vector<float>> generate_test_vectors_matrix(int num_vectors, int dimension); +// Generate random vectors as a flatten vector +std::vector<float> generate_test_vectors_flatten(int num_vectors, int dimension); + +// Enum for different index types +enum class IndexType { + FLAT_L2, + HNSW, + // Add more index types as needed +}; +std::unique_ptr<faiss::Index> create_native_index(IndexType type, int dimension, int m); +std::unique_ptr<doris::segment_v2::VectorIndex> create_doris_index(IndexType index_type, + int dimension, int m); + +// Helper function to add vectors to both Doris and native indexes +void add_vectors_to_indexes_serial_mode(segment_v2::VectorIndex* doris_index, + faiss::Index* native_index, + const std::vector<std::vector<float>>& vectors); + +void add_vectors_to_indexes_batch_mode(segment_v2::VectorIndex* doris_index, + faiss::Index* native_index, size_t num_vectors, + const std::vector<float>& flatten_vectors); + +void print_search_results(const segment_v2::IndexSearchResult& doris_results, + const std::vector<float>& native_distances, + const std::vector<faiss::idx_t>& native_indices, int query_idx); + +float get_radius_from_flatten(const float* vector, int dim, + const std::vector<float>& flatten_vectors, float percentile); +float get_radius_from_matrix(const float* vector, int dim, + const std::vector<std::vector<float>>& matrix_vectors, + float percentile); +// Helper function to compare search results between Doris and native Faiss +void compare_search_results(const segment_v2::IndexSearchResult& doris_results, + const std::vector<float>& native_distances, + const std::vector<faiss::idx_t>& native_indices, + float abs_error = 1e-5); + +// result is a vector of pairs, where each pair contains the labels and distance +// result is sorted by labels +std::vector<std::pair<int, float>> perform_native_index_range_search(faiss::Index* index, + const float* query_vector, + float radius); + +std::unique_ptr<doris::segment_v2::IndexSearchResult> perform_doris_index_range_search( + segment_v2::VectorIndex* index, const float* query_vector, float radius, + const segment_v2::IndexSearchParameters& params); + class MockIndexFileReader : public ::doris::segment_v2::IndexFileReader { public: MockIndexFileReader() @@ -83,8 +147,6 @@ public: ~MockAnnIndexIterator() override = default; - IndexType type() override { return IndexType::ANN; } - MOCK_METHOD(Status, read_from_index, (const doris::segment_v2::IndexParam& param), (override)); MOCK_METHOD(Status, range_search, (const segment_v2::RangeSearchParams& params, @@ -95,7 +157,7 @@ public: private: io::IOContext _io_ctx_mock; }; -} // namespace doris::vec_search_mock +} // namespace doris::vector_search_utils namespace doris::vectorized { template <typename T> @@ -189,7 +251,7 @@ protected: virtual_slot_ref_node.__isset.opcode = false; _virtual_slot_ref_expr.nodes.push_back(virtual_slot_ref_node); - _ann_index_iterator = std::make_unique<vec_search_mock::MockAnnIndexIterator>(); + _ann_index_iterator = std::make_unique<vector_search_utils::MockAnnIndexIterator>(); _row_desc = RowDescriptor(*_desc_tbl, {0}, {false}); @@ -210,7 +272,7 @@ protected: private: doris::ObjectPool obj_pool; RowDescriptor _row_desc; - std::unique_ptr<vec_search_mock::MockAnnIndexIterator> _ann_index_iterator; + std::unique_ptr<vector_search_utils::MockAnnIndexIterator> _ann_index_iterator; vectorized::IColumn::MutablePtr _result_column; doris::TExpr _virtual_slot_ref_expr; DescriptorTbl* _desc_tbl; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org