This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git

commit bb93c7d53aa092c0242462003b94c1840f2112cd
Author: zhiqiang <hezhiqi...@selectdb.com>
AuthorDate: Tue May 27 21:37:14 2025 +0800

    [fix] VirtualColumnIterator should do scatter to input column in its 
prepare function. (#51299)
---
 be/src/olap/rowset/segment_v2/ann_index_writer.cpp |  2 +-
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  5 +-
 .../rowset/segment_v2/virtual_column_iterator.cpp  | 67 +++++++++++++++----
 be/src/vec/exprs/vann_topn_predicate.cpp           |  6 +-
 be/src/vec/exprs/vectorized_fn_call.cpp            | 12 ++--
 .../vec/functions/array/function_array_distance.h  |  5 +-
 .../olap/vector_search/ann_range_search_test.cpp   |  8 +--
 .../vector_search/virtual_column_iterator_test.cpp | 76 ++++++++++++++++++++++
 8 files changed, 151 insertions(+), 30 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
index 0f8c6cd1d7c..ffc3b9bb1fd 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_writer.cpp
@@ -59,7 +59,7 @@ Status AnnIndexColumnWriter::init() {
     _vector_index = nullptr;
     const auto& properties = _index_meta->properties();
     const std::string index_type = get_or_default(properties, INDEX_TYPE, 
"hnsw");
-    const std::string metric_type = get_or_default(properties, METRIC_TYPE, 
"l2");
+    const std::string metric_type = get_or_default(properties, METRIC_TYPE, 
"l2_distance");
     const std::string quantilizer = get_or_default(properties, QUANTILIZER, 
"flat");
     FaissBuildParameter builderParameter;
     std::shared_ptr<FaissVectorIndex> faiss_index = 
std::make_shared<FaissVectorIndex>();
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 72748928357..0d971229c39 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1887,9 +1887,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t 
nrows_read_limit, uint32
     SCOPED_RAW_TIMER(&_opts.stats->predicate_column_read_ns);
 
     nrows_read = _range_iter->read_batch_rowids(_block_rowids.data(), 
nrows_read_limit);
-    LOG_INFO("nrows_read from range iterator: {}", nrows_read);
     bool is_continuous = (nrows_read > 1) &&
                          (_block_rowids[nrows_read - 1] - _block_rowids[0] == 
nrows_read - 1);
+    LOG_INFO("nrows_read from range iterator: {}, is_continus {}", nrows_read, 
is_continuous);
+
     std::vector<ColumnId> predicate_column_ids_and_virtual_columns;
     
predicate_column_ids_and_virtual_columns.reserve(_cols_read_by_column_predicate.size()
 +
                                                      
_virtual_column_exprs.size());
@@ -2927,7 +2928,6 @@ bool SegmentIterator::_can_opt_topn_reads() {
 
 Status SegmentIterator::_materialization_of_virtual_column(vectorized::Block* 
block) {
     size_t prev_block_columns = block->columns();
-    LOG_INFO("Materialize all {} virtual columns", 
_virtual_column_exprs.size());
     for (const auto& cid_and_expr : _virtual_column_exprs) {
         auto cid = cid_and_expr.first;
         auto column_expr = cid_and_expr.second;
@@ -2952,7 +2952,6 @@ Status 
SegmentIterator::_materialization_of_virtual_column(vectorized::Block* bl
     // During execution of expr, some columns may be added to the end of the 
block.
     // Remove them to keep consistent with current block.
     block->erase_tail(prev_block_columns);
-    LOG_INFO("Materialize all {} virtual columns end.", 
_virtual_column_exprs.size());
     return Status::OK();
 }
 
diff --git a/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp 
b/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
index 82cb4631cd3..73285504624 100644
--- a/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/virtual_column_iterator.cpp
@@ -20,6 +20,7 @@
 #include <cstring>
 #include <memory>
 
+#include "gutil/integral_types.h"
 #include "vec/columns/column.h"
 #include "vec/columns/column_nothing.h"
 
@@ -36,16 +37,50 @@ Status VirtualColumnIterator::init(const 
ColumnIteratorOptions& opts) {
 
 void VirtualColumnIterator::prepare_materialization(vectorized::IColumn::Ptr 
column,
                                                     
std::unique_ptr<std::vector<uint64_t>> labels) {
-    _materialized_column_ptr = column;
+    DCHECK(labels->size() == column->size()) << "labels size: " << 
labels->size()
+                                             << ", materialized column size: " 
<< column->size();
+    // 1. Sort the labels.
+    // column: [100, 101, 102, 99, 50, 49]
+    // labels: [5,   4,   1,   10, 7,  2]
+    const std::vector<uint64_t>& labels_ref = *labels;
+    const size_t n = labels_ref.size();
+    LOG_INFO("Input labels {}", fmt::join(labels_ref, ", "));
+    std::vector<size_t> order(n);
+    // global_row_id_to_idx:
+    // {5:0, 4:1, 1:2, 10:3, 7:4, 2:5}
+    std::map<size_t, size_t> global_row_id_to_idx;
+    for (size_t i = 0; i < n; ++i) {
+        order[i] = labels_ref[i];
+        global_row_id_to_idx[labels_ref[i]] = i;
+    }
+
+    // order after sorting: [1, 2, 4, 5, 7, 10]
+    std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return a < 
b; });
+    LOG_INFO("Sorted order {}", fmt::join(order, ", "));
+    // 2. scatter column
+    auto scattered_column = column->clone_empty();
+    // We need a mapping from global row id to local index in the materialized 
column.
     _row_id_to_idx.clear();
-    DCHECK(labels->size() == _materialized_column_ptr->size())
-            << "labels size: " << labels->size()
-            << ", materialized column size: " << 
_materialized_column_ptr->size();
-    _size = _materialized_column_ptr->size();
-    for (size_t i = 0; i < _size; ++i) {
-        _row_id_to_idx[(*labels)[i]] = i;
+    for (size_t i = 0; i < n; ++i) {
+        size_t global_idx = order[i];
+        size_t original_col_idx = global_row_id_to_idx[global_idx];
+        _row_id_to_idx[global_idx] = i;
+        scattered_column->insert_from(*column, original_col_idx);
     }
 
+    // After scatter:
+    // scattered_column: [102, 49, 101, 100, 50, 99]
+    // _row_id_to_idx: {1:0, 2:1, 4:2, 5:3, 7:4, 10:5}
+    _materialized_column_ptr = std::move(scattered_column);
+
+    _size = n;
+
+    std::string msg;
+    for (const auto& pair : _row_id_to_idx) {
+        msg += fmt::format("{}: {}, ", pair.first, pair.second);
+    }
+
+    LOG_INFO("virtual column iterator, row_idx_to_idx:\n{}", msg);
     _filter = doris::vectorized::IColumn::Filter(_size, 0);
 }
 
@@ -86,10 +121,15 @@ Status VirtualColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr
         return Status::InternalError("Current ordinal {} not found in 
row_id_to_idx map",
                                      _current_ordinal);
     }
-    size_t start = _row_id_to_idx[_current_ordinal];
+
     // Update dst column
-    dst = _materialized_column_ptr->clone_empty();
-    dst->insert_range_from(*_materialized_column_ptr, start, rows_num_to_read);
+    if (vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst)) {
+        LOG_INFO("Dst is nothing column, create new mutable column");
+        dst = _materialized_column_ptr->clone_resized(rows_num_to_read);
+    } else {
+        size_t start = _row_id_to_idx[_current_ordinal];
+        dst->insert_range_from(*_materialized_column_ptr, start, 
rows_num_to_read);
+    }
 
     LOG_INFO("Virtual column iterators, next_batch, rows reads: {}, dst size: 
{}", rows_num_to_read,
              dst->size());
@@ -114,7 +154,12 @@ Status VirtualColumnIterator::read_by_rowids(const 
rowid_t* rowids, const size_t
     // Apply filter to materialized column
     doris::vectorized::IColumn::Ptr res_col = 
_materialized_column_ptr->filter(_filter, 0);
     // Update dst column
-    dst = res_col->assume_mutable();
+    if (vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst)) {
+        LOG_INFO("Dst is nothing column, create new mutable column");
+        dst = res_col->assume_mutable();
+    } else {
+        dst->insert_range_from(*res_col, 0, res_col->size());
+    }
 
     LOG_INFO("Virtual column iterators, read_by_rowids, rowids size: {}, dst 
size: {}", count,
              dst->size());
diff --git a/be/src/vec/exprs/vann_topn_predicate.cpp 
b/be/src/vec/exprs/vann_topn_predicate.cpp
index 30352b09782..1f8a9bd7047 100644
--- a/be/src/vec/exprs/vann_topn_predicate.cpp
+++ b/be/src/vec/exprs/vann_topn_predicate.cpp
@@ -158,12 +158,12 @@ Status AnnTopNDescriptor::evaluate_vector_ann_search(
     DCHECK(ann_query_params.row_ids != nullptr);
 
     result_column = ColumnFloat64::create();
-    ColumnFloat64* result_column_float = 
assert_cast<ColumnFloat64*>(result_column.get());
+    ColumnFloat64* result_column_double = 
assert_cast<ColumnFloat64*>(result_column.get());
 
     size_t num_results = ann_query_params.distance->size();
-    result_column_float->resize(num_results);
+    result_column_double->resize(num_results);
     for (size_t i = 0; i < num_results; ++i) {
-        result_column_float->get_data()[i] = (*ann_query_params.distance)[i];
+        result_column_double->get_data()[i] = (*ann_query_params.distance)[i];
     }
     row_ids = std::move(ann_query_params.row_ids);
     return Status::OK();
diff --git a/be/src/vec/exprs/vectorized_fn_call.cpp 
b/be/src/vec/exprs/vectorized_fn_call.cpp
index 9ed006195c3..c6fa1a45d55 100644
--- a/be/src/vec/exprs/vectorized_fn_call.cpp
+++ b/be/src/vec/exprs/vectorized_fn_call.cpp
@@ -525,11 +525,13 @@ Status VectorizedFnCall::evaluate_ann_range_search(
             DCHECK(virtual_column_iterator != nullptr);
             // Now convert distance to column
             size_t size = result.roaring->cardinality();
-            // TODO: need to consider nullable column.
-            auto distance_col = ColumnFloat32::create();
-
-            
distance_col->insert_many_raw_data(reinterpret_cast<char*>(result.distance.get()),
-                                               size);
+            auto distance_col = ColumnFloat64::create(size);
+            // float* -> double*: each element must be converted one by one
+            const float* src = reinterpret_cast<const 
float*>(result.distance.get());
+            double* dst = distance_col->get_data().data();
+            for (size_t i = 0; i < size; ++i) {
+                dst[i] = static_cast<double>(src[i]);
+            }
             
virtual_column_iterator->prepare_materialization(std::move(distance_col),
                                                              
std::move(result.row_ids));
         } else {
diff --git a/be/src/vec/functions/array/function_array_distance.h 
b/be/src/vec/functions/array/function_array_distance.h
index 99db74718cb..d0c43c4afb0 100644
--- a/be/src/vec/functions/array/function_array_distance.h
+++ b/be/src/vec/functions/array/function_array_distance.h
@@ -95,9 +95,8 @@ public:
 
     Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
                         uint32_t result, size_t input_rows_count) const 
override {
-        LOG_INFO("Function {} is executed with {} rows, stack {}", get_name(), 
input_rows_count,
-                 doris::get_stack_trace());
-
+        // LOG_INFO("Function {} is executed with {} rows, stack {}", 
get_name(), input_rows_count,
+        //          doris::get_stack_trace());
         const auto& arg1 = block.get_by_position(arguments[0]);
         const auto& arg2 = block.get_by_position(arguments[1]);
         if (!_check_input_type(arg1.type) || !_check_input_type(arg2.type)) {
diff --git a/be/test/olap/vector_search/ann_range_search_test.cpp 
b/be/test/olap/vector_search/ann_range_search_test.cpp
index f5599526b38..475dd2dad17 100644
--- a/be/test/olap/vector_search/ann_range_search_test.cpp
+++ b/be/test/olap/vector_search/ann_range_search_test.cpp
@@ -1028,13 +1028,13 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
             
dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get());
 
     vectorized::IColumn::Ptr column = 
virtual_column_iter->get_materialized_column();
-    const vectorized::ColumnFloat32* float_column =
-            check_and_get_column<const 
vectorized::ColumnFloat32>(column.get());
+    const vectorized::ColumnFloat64* double_column =
+            check_and_get_column<const 
vectorized::ColumnFloat64>(column.get());
     const vectorized::ColumnNothing* nothing_column =
             check_and_get_column<const 
vectorized::ColumnNothing>(column.get());
-    ASSERT_NE(float_column, nullptr);
+    ASSERT_NE(double_column, nullptr);
     ASSERT_EQ(nothing_column, nullptr);
-    EXPECT_EQ(float_column->size(), 10);
+    EXPECT_EQ(double_column->size(), 10);
     EXPECT_EQ(row_bitmap.cardinality(), 10);
 
     const auto& get_row_id_to_idx = virtual_column_iter->get_row_id_to_idx();
diff --git a/be/test/olap/vector_search/virtual_column_iterator_test.cpp 
b/be/test/olap/vector_search/virtual_column_iterator_test.cpp
index d860a7a7ad8..0c3f4a73169 100644
--- a/be/test/olap/vector_search/virtual_column_iterator_test.cpp
+++ b/be/test/olap/vector_search/virtual_column_iterator_test.cpp
@@ -20,6 +20,7 @@
 #include <gtest/gtest.h>
 
 #include "vec/columns/column.h"
+#include "vec/columns/column_nothing.h"
 #include "vec/columns/column_string.h"
 #include "vec/columns/column_vector.h"
 #include "vec/core/types.h"
@@ -344,4 +345,79 @@ TEST_F(VectorSearchTest, NextBatchTest1) {
     }
 }
 
+TEST_F(VectorSearchTest, TestPrepare1) {
+    VirtualColumnIterator iterator;
+
+    // Create a materialized int32_t column with values [10, 20, 30, 40, 50]
+    auto int_column = vectorized::ColumnVector<int32_t>::create();
+    int_column->insert(10);
+    int_column->insert(20);
+    int_column->insert(30);
+    int_column->insert(40);
+    int_column->insert(50);
+    auto labels = std::make_unique<std::vector<uint64_t>>();
+    labels->push_back(100);
+    labels->push_back(11);
+    labels->push_back(33);
+    labels->push_back(22);
+    labels->push_back(55);
+    // Set the materialized column
+    iterator.prepare_materialization(std::move(int_column), std::move(labels));
+
+    // Verify row_id_to_idx mapping
+    const auto& row_id_to_idx = iterator.get_row_id_to_idx();
+    ASSERT_EQ(row_id_to_idx.size(), 5);
+    ASSERT_EQ(row_id_to_idx.find(11)->second, 0);
+    ASSERT_EQ(row_id_to_idx.find(22)->second, 1);
+    ASSERT_EQ(row_id_to_idx.find(33)->second, 2);
+    ASSERT_EQ(row_id_to_idx.find(55)->second, 3);
+    ASSERT_EQ(row_id_to_idx.find(100)->second, 4);
+
+    auto materialization_col = iterator.get_materialized_column();
+    auto int_col_m =
+            assert_cast<const 
vectorized::ColumnVector<int32_t>*>(materialization_col.get());
+    ASSERT_EQ(int_col_m->get_data()[0], 20);
+    ASSERT_EQ(int_col_m->get_data()[1], 40);
+    ASSERT_EQ(int_col_m->get_data()[2], 30);
+    ASSERT_EQ(int_col_m->get_data()[3], 50);
+    ASSERT_EQ(int_col_m->get_data()[4], 10);
+}
+
+TEST_F(VectorSearchTest, TestColumnNothing) {
+    VirtualColumnIterator iterator;
+
+    // Create a materialized int32_t column with values [10, 20, 30, 40, 50]
+    auto int_column = vectorized::ColumnVector<int32_t>::create();
+    int_column->insert(10);
+    int_column->insert(20);
+    int_column->insert(30);
+    int_column->insert(40);
+    int_column->insert(50);
+    auto labels = std::make_unique<std::vector<uint64_t>>();
+    labels->push_back(100);
+    labels->push_back(11);
+    labels->push_back(33);
+    labels->push_back(22);
+    labels->push_back(55);
+    // Set the materialized column
+    iterator.prepare_materialization(std::move(int_column), std::move(labels));
+
+    // Create destination column
+    vectorized::MutableColumnPtr dst = vectorized::ColumnNothing::create(0);
+
+    // Read by rowids: the ColumnNothing dst is replaced by a 3-row column
+    rowid_t rowids[] = {11, 22, 33};
+    size_t count = sizeof(rowids) / sizeof(rowids[0]);
+    Status status = iterator.read_by_rowids(rowids, count, dst);
+    ASSERT_TRUE(status.ok());
+    auto tmp_nothing = 
vectorized::check_and_get_column<vectorized::ColumnNothing>(*dst);
+    ASSERT_TRUE(tmp_nothing == nullptr);
+    auto tmp_col_i32 = 
vectorized::check_and_get_column<vectorized::ColumnVector<int32_t>>(
+            *iterator.get_materialized_column());
+    ASSERT_TRUE(tmp_col_i32 != nullptr);
+    ASSERT_EQ(dst->size(), 3);
+    ASSERT_EQ(tmp_col_i32->get_data()[0], 20);
+    ASSERT_EQ(tmp_col_i32->get_data()[1], 40);
+    ASSERT_EQ(tmp_col_i32->get_data()[2], 30);
+}
 } // namespace doris::vectorized
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to