This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch vector-index-dev in repository https://gitbox.apache.org/repos/asf/doris.git
commit bb93c7d53aa092c0242462003b94c1840f2112cd Author: zhiqiang <hezhiqi...@selectdb.com> AuthorDate: Tue May 27 21:37:14 2025 +0800 [fix] VirtualColumnIterator should do scatter to input column in its prepare function. (#51299) --- be/src/olap/rowset/segment_v2/ann_index_writer.cpp | 2 +- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 5 +- .../rowset/segment_v2/virtual_column_iterator.cpp | 67 +++++++++++++++---- be/src/vec/exprs/vann_topn_predicate.cpp | 6 +- be/src/vec/exprs/vectorized_fn_call.cpp | 12 ++-- .../vec/functions/array/function_array_distance.h | 5 +- .../olap/vector_search/ann_range_search_test.cpp | 8 +-- .../vector_search/virtual_column_iterator_test.cpp | 76 ++++++++++++++++++++++ 8 files changed, 151 insertions(+), 30 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp index 0f8c6cd1d7c..ffc3b9bb1fd 100644 --- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp @@ -59,7 +59,7 @@ Status AnnIndexColumnWriter::init() { _vector_index = nullptr; const auto& properties = _index_meta->properties(); const std::string index_type = get_or_default(properties, INDEX_TYPE, "hnsw"); - const std::string metric_type = get_or_default(properties, METRIC_TYPE, "l2"); + const std::string metric_type = get_or_default(properties, METRIC_TYPE, "l2_distance"); const std::string quantilizer = get_or_default(properties, QUANTILIZER, "flat"); FaissBuildParameter builderParameter; std::shared_ptr<FaissVectorIndex> faiss_index = std::make_shared<FaissVectorIndex>(); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 72748928357..0d971229c39 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1887,9 +1887,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_ns); nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(), nrows_read_limit); - LOG_INFO("nrows_read from range iterator: {}", nrows_read); bool is_continuous = (nrows_read > 1) && (_block_rowids[nrows_read - 1] - _block_rowids[0] == nrows_read - 1); + LOG_INFO("nrows_read from range iterator: {}, is_continus {}", nrows_read, is_continuous); + std::vector<ColumnId> predicate_column_ids_and_virtual_columns; predicate_column_ids_and_virtual_columns.reserve(_cols_read_by_column_predicate.size() + _virtual_column_exprs.size()); @@ -2927,7 +2928,6 @@ bool SegmentIterator::_can_opt_topn_reads() { Status SegmentIterator::_materialization_of_virtual_column(vectorized::Block* block) { size_t prev_block_columns = block->columns(); - LOG_INFO("Materialize all {} virtual columns", _virtual_column_exprs.size()); for (const auto& cid_and_expr : _virtual_column_exprs) { auto cid = cid_and_expr.first; auto column_expr = cid_and_expr.second; @@ -2952,7 +2952,6 @@ Status SegmentIterator::_materialization_of_virtual_column(vectorized::Block* bl // During execution of expr, some columns may be added to the end of the block. // Remove them to keep consistent with current block. block->erase_tail(prev_block_columns); - LOG_INFO("Materialize all {} virtual columns end.", _virtual_column_exprs.size()); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp b/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp index 82cb4631cd3..73285504624 100644 --- a/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp @@ -20,6 +20,7 @@ #include <cstring> #include <memory> +#include "gutil/integral_types.h" #include "vec/columns/column.h" #include "vec/columns/column_nothing.h" @@ -36,16 +37,50 @@ Status VirtualColumnIterator::init(const ColumnIteratorOptions& opts) { void VirtualColumnIterator::prepare_materialization(vectorized::IColumn::Ptr column, std::unique_ptr<std::vector<uint64_t>> labels) { - _materialized_column_ptr = column; + DCHECK(labels->size() == column->size()) << "labels size: " << labels->size() + << ", materialized column size: " << column->size(); + // 1. do sort to labels + // column: [100, 101, 102, 99, 50, 49] + // lables: [5, 4, 1, 10, 7, 2] + const std::vector<uint64_t>& labels_ref = *labels; + const size_t n = labels_ref.size(); + LOG_INFO("Input labels {}", fmt::join(labels_ref, ", ")); + std::vector<size_t> order(n); + // global_row_id_to_idx: + // {5:0, 4:1, 1:2, 10:3, 7:4, 2:5} + std::map<size_t, size_t> global_row_id_to_idx; + for (size_t i = 0; i < n; ++i) { + order[i] = labels_ref[i]; + global_row_id_to_idx[labels_ref[i]] = i; + } + + // orders: [1,2,4,5,7,10] + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return a < b; }); + LOG_INFO("Sorted order {}", fmt::join(order, ", ")); + // 2. scatter column + auto scattered_column = column->clone_empty(); + // We need a mapping from global row id to local index in the materialized column. _row_id_to_idx.clear(); - DCHECK(labels->size() == _materialized_column_ptr->size()) - << "labels size: " << labels->size() - << ", materialized column size: " << _materialized_column_ptr->size(); - _size = _materialized_column_ptr->size(); - for (size_t i = 0; i < _size; ++i) { - _row_id_to_idx[(*labels)[i]] = i; + for (size_t i = 0; i < n; ++i) { + size_t global_idx = order[i]; + size_t original_col_idx = global_row_id_to_idx[global_idx]; + _row_id_to_idx[global_idx] = i; + scattered_column->insert_from(*column, original_col_idx); } + // After scatter: + // scattered_column: [102, 49, 101, 100, 50, 99] + // _row_id_to_idx: {1:0, 2:1, 4:2, 5:3, 7:4, 10:5} + _materialized_column_ptr = std::move(scattered_column); + + _size = n; + + std::string msg; + for (const auto& pair : _row_id_to_idx) { + msg += fmt::format("{}: {}, ", pair.first, pair.second); + } + + LOG_INFO("virtual column iterator, row_idx_to_idx:\n{}", msg); _filter = doris::vectorized::IColumn::Filter(_size, 0); } @@ -86,10 +121,15 @@ Status VirtualColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr return Status::InternalError("Current ordinal {} not found in row_id_to_idx map", _current_ordinal); } - size_t start = _row_id_to_idx[_current_ordinal]; + // Update dst column - dst = _materialized_column_ptr->clone_empty(); - dst->insert_range_from(*_materialized_column_ptr, start, rows_num_to_read); + if (vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst)) { + LOG_INFO("Dst is nothing column, create new mutable column"); + dst = _materialized_column_ptr->clone_resized(rows_num_to_read); + } else { + size_t start = _row_id_to_idx[_current_ordinal]; + dst->insert_range_from(*_materialized_column_ptr, start, rows_num_to_read); + } LOG_INFO("Virtual column iterators, next_batch, rows reads: {}, dst size: {}", rows_num_to_read, dst->size()); @@ -114,7 +154,12 @@ Status VirtualColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t // Apply filter to materialized column doris::vectorized::IColumn::Ptr res_col = _materialized_column_ptr->filter(_filter, 0); // Update dst column - dst = res_col->assume_mutable(); + if (vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst)) { + LOG_INFO("Dst is nothing column, create new mutable column"); + dst = res_col->assume_mutable(); + } else { + dst->insert_range_from(*res_col, 0, res_col->size()); + } LOG_INFO("Virtual column iterators, read_by_rowids, rowids size: {}, dst size: {}", count, dst->size()); diff --git a/be/src/vec/exprs/vann_topn_predicate.cpp b/be/src/vec/exprs/vann_topn_predicate.cpp index 30352b09782..1f8a9bd7047 100644 --- a/be/src/vec/exprs/vann_topn_predicate.cpp +++ b/be/src/vec/exprs/vann_topn_predicate.cpp @@ -158,12 +158,12 @@ Status AnnTopNDescriptor::evaluate_vector_ann_search( DCHECK(ann_query_params.row_ids != nullptr); result_column = ColumnFloat64::create(); - ColumnFloat64* result_column_float = assert_cast<ColumnFloat64*>(result_column.get()); + ColumnFloat64* result_column_double = assert_cast<ColumnFloat64*>(result_column.get()); size_t num_results = ann_query_params.distance->size(); - result_column_float->resize(num_results); + result_column_double->resize(num_results); for (size_t i = 0; i < num_results; ++i) { - result_column_float->get_data()[i] = (*ann_query_params.distance)[i]; + result_column_double->get_data()[i] = (*ann_query_params.distance)[i]; } row_ids = std::move(ann_query_params.row_ids); return Status::OK(); diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp b/be/src/vec/exprs/vectorized_fn_call.cpp index 9ed006195c3..c6fa1a45d55 100644 --- a/be/src/vec/exprs/vectorized_fn_call.cpp +++ b/be/src/vec/exprs/vectorized_fn_call.cpp @@ -525,11 +525,13 @@ Status VectorizedFnCall::evaluate_ann_range_search( DCHECK(virtual_column_iterator != nullptr); // Now convert distance to column size_t size = result.roaring->cardinality(); - // TODO: need to consider nullable column. - auto distance_col = ColumnFloat32::create(); - - distance_col->insert_many_raw_data(reinterpret_cast<char*>(result.distance.get()), - size); + auto distance_col = ColumnFloat64::create(size); + // float* -> double*,需要逐个转换 + const float* src = reinterpret_cast<const float*>(result.distance.get()); + double* dst = distance_col->get_data().data(); + for (size_t i = 0; i < size; ++i) { + dst[i] = static_cast<double>(src[i]); + } virtual_column_iterator->prepare_materialization(std::move(distance_col), std::move(result.row_ids)); } else { diff --git a/be/src/vec/functions/array/function_array_distance.h b/be/src/vec/functions/array/function_array_distance.h index 99db74718cb..d0c43c4afb0 100644 --- a/be/src/vec/functions/array/function_array_distance.h +++ b/be/src/vec/functions/array/function_array_distance.h @@ -95,9 +95,8 @@ public: Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) const override { - LOG_INFO("Function {} is executed with {} rows, stack {}", get_name(), input_rows_count, - doris::get_stack_trace()); - + // LOG_INFO("Function {} is executed with {} rows, stack {}", get_name(), input_rows_count, + // doris::get_stack_trace()); const auto& arg1 = block.get_by_position(arguments[0]); const auto& arg2 = block.get_by_position(arguments[1]); if (!_check_input_type(arg1.type) || !_check_input_type(arg2.type)) { diff --git a/be/test/olap/vector_search/ann_range_search_test.cpp b/be/test/olap/vector_search/ann_range_search_test.cpp index f5599526b38..475dd2dad17 100644 --- a/be/test/olap/vector_search/ann_range_search_test.cpp +++ b/be/test/olap/vector_search/ann_range_search_test.cpp @@ -1028,13 +1028,13 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) { dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get()); vectorized::IColumn::Ptr column = virtual_column_iter->get_materialized_column(); - const vectorized::ColumnFloat32* float_column = - check_and_get_column<const vectorized::ColumnFloat32>(column.get()); + const vectorized::ColumnFloat64* double_column = + check_and_get_column<const vectorized::ColumnFloat64>(column.get()); const vectorized::ColumnNothing* nothing_column = check_and_get_column<const vectorized::ColumnNothing>(column.get()); - ASSERT_NE(float_column, nullptr); + ASSERT_NE(double_column, nullptr); ASSERT_EQ(nothing_column, nullptr); - EXPECT_EQ(float_column->size(), 10); + EXPECT_EQ(double_column->size(), 10); EXPECT_EQ(row_bitmap.cardinality(), 10); const auto& get_row_id_to_idx = virtual_column_iter->get_row_id_to_idx(); diff --git a/be/test/olap/vector_search/virtual_column_iterator_test.cpp b/be/test/olap/vector_search/virtual_column_iterator_test.cpp index d860a7a7ad8..0c3f4a73169 100644 --- a/be/test/olap/vector_search/virtual_column_iterator_test.cpp +++ b/be/test/olap/vector_search/virtual_column_iterator_test.cpp @@ -20,6 +20,7 @@ #include <gtest/gtest.h> #include "vec/columns/column.h" +#include "vec/columns/column_nothing.h" #include "vec/columns/column_string.h" #include "vec/columns/column_vector.h" #include "vec/core/types.h" @@ -344,4 +345,79 @@ TEST_F(VectorSearchTest, NextBatchTest1) { } } +TEST_F(VectorSearchTest, TestPrepare1) { + VirtualColumnIterator iterator; + + // Create a materialized int32_t column with values [10, 20, 30, 40, 50] + auto int_column = vectorized::ColumnVector<int32_t>::create(); + int_column->insert(10); + int_column->insert(20); + int_column->insert(30); + int_column->insert(40); + int_column->insert(50); + auto labels = std::make_unique<std::vector<uint64_t>>(); + labels->push_back(100); + labels->push_back(11); + labels->push_back(33); + labels->push_back(22); + labels->push_back(55); + // Set the materialized column + iterator.prepare_materialization(std::move(int_column), std::move(labels)); + + // Verify row_id_to_idx mapping + const auto& row_id_to_idx = iterator.get_row_id_to_idx(); + ASSERT_EQ(row_id_to_idx.size(), 5); + ASSERT_EQ(row_id_to_idx.find(11)->second, 0); + ASSERT_EQ(row_id_to_idx.find(22)->second, 1); + ASSERT_EQ(row_id_to_idx.find(33)->second, 2); + ASSERT_EQ(row_id_to_idx.find(55)->second, 3); + ASSERT_EQ(row_id_to_idx.find(100)->second, 4); + + auto materialization_col = iterator.get_materialized_column(); + auto int_col_m = + assert_cast<const vectorized::ColumnVector<int32_t>*>(materialization_col.get()); + ASSERT_EQ(int_col_m->get_data()[0], 20); + ASSERT_EQ(int_col_m->get_data()[1], 40); + ASSERT_EQ(int_col_m->get_data()[2], 30); + ASSERT_EQ(int_col_m->get_data()[3], 50); + ASSERT_EQ(int_col_m->get_data()[4], 10); +} + +TEST_F(VectorSearchTest, TestColumnNothing) { + VirtualColumnIterator iterator; + + // Create a materialized int32_t column with values [10, 20, 30, 40, 50] + auto int_column = vectorized::ColumnVector<int32_t>::create(); + int_column->insert(10); + int_column->insert(20); + int_column->insert(30); + int_column->insert(40); + int_column->insert(50); + auto labels = std::make_unique<std::vector<uint64_t>>(); + labels->push_back(100); + labels->push_back(11); + labels->push_back(33); + labels->push_back(22); + labels->push_back(55); + // Set the materialized column + iterator.prepare_materialization(std::move(int_column), std::move(labels)); + + // Create destination column + vectorized::MutableColumnPtr dst = vectorized::ColumnNothing::create(0); + + // Read by rowids, should return empty result + rowid_t rowids[] = {11, 22, 33}; + size_t count = sizeof(rowids) / sizeof(rowids[0]); + Status status = iterator.read_by_rowids(rowids, count, dst); + ASSERT_TRUE(status.ok()); + auto tmp_nothing = vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst); + ASSERT_TRUE(tmp_nothing == nullptr); + auto tmp_col_i32 = vectorized::check_and_get_column<vectorized::ColumnVector<int32_t>>( + *iterator.get_materialized_column()); + ASSERT_TRUE(tmp_col_i32 != nullptr); + ASSERT_EQ(dst->size(), 3); + ASSERT_EQ(tmp_col_i32->get_data()[0], 20); + ASSERT_EQ(tmp_col_i32->get_data()[1], 40); + ASSERT_EQ(tmp_col_i32->get_data()[2], 30); +} } // namespace doris::vectorized \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org