This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new f1cc4a373fd [fix](ann-index) Fix ANN range search state leakage and
incorrect slot index tracking. (#63965)
f1cc4a373fd is described below
commit f1cc4a373fdc7303fcfa7af4481ed400ef47269b
Author: Qi Chen <[email protected]>
AuthorDate: Tue Jun 2 12:05:39 2026 +0800
[fix](ann-index) Fix ANN range search state leakage and incorrect slot
index tracking. (#63965)
### What problem does this PR solve?
Issue Number: close #xxx
Related PR: #xxx
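Problem Summary: The outcome of an ANN range search (`_has_been_executed`, `_virtual_column_is_fulfilled`) was stored as member state on the `VExpr` node. Cloned `VExprContext`s share the same root expression, so the state set by one evaluation could leak into another (for example, into a segment that has no ANN index). This change returns the outcome per call through the new `AnnRangeSearchEvaluationResult` struct and removes the member flags. `SegmentIterator::_apply_index_expr` now counts executed range searches from that per-call result and no longer erases executed contexts from `_common_expr_ctxs_push_down`. In addition, `VExprContext::evaluate_ann_range_search` looked up the slot-ref map by column id and passed `idx_to_cid[cid]` to `set_true_for_index_status`; it now uses the source column index for both, and records the resulting row bitmap as the index result for the root expression.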
### Release note
Cherry-pick of #63666.
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [ ] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [ ] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [ ] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/exprs/vectorized_fn_call.cpp | 19 +-
be/src/exprs/vectorized_fn_call.h | 3 +-
be/src/exprs/vexpr.cpp | 12 +-
be/src/exprs/vexpr.h | 23 ++-
be/src/exprs/vexpr_context.cpp | 37 +++-
be/src/exprs/vexpr_context.h | 3 +-
be/src/exprs/virtual_slot_ref.cpp | 7 +-
be/src/exprs/virtual_slot_ref.h | 5 +-
be/src/storage/segment/segment_iterator.cpp | 15 +-
.../storage/index/ann/ann_range_search_test.cpp | 210 ++++++++++++++++++++-
.../ann_range_search_pushdown_regression.groovy | 141 ++++++++++++++
...ge_search_source_index_status_regression.groovy | 84 +++++++++
12 files changed, 500 insertions(+), 59 deletions(-)
diff --git a/be/src/exprs/vectorized_fn_call.cpp
b/be/src/exprs/vectorized_fn_call.cpp
index a190891bdad..e28c08422db 100644
--- a/be/src/exprs/vectorized_fn_call.cpp
+++ b/be/src/exprs/vectorized_fn_call.cpp
@@ -535,7 +535,9 @@ Status VectorizedFnCall::evaluate_ann_range_search(
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
cid_to_index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats) {
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ AnnRangeSearchEvaluationResult& evaluation_result) {
+ evaluation_result = {};
if (range_search_runtime.is_ann_range_search == false) {
return Status::OK();
}
@@ -544,8 +546,8 @@ Status VectorizedFnCall::evaluate_ann_range_search(
range_search_runtime.to_string());
size_t origin_num = row_bitmap.cardinality();
- int idx_in_block = static_cast<int>(range_search_runtime.src_col_idx);
- DCHECK(idx_in_block < idx_to_cid.size())
+ const auto idx_in_block = range_search_runtime.src_col_idx;
+ DCHECK_LT(idx_in_block, idx_to_cid.size())
<< "idx_in_block: " << idx_in_block << ", idx_to_cid.size(): " <<
idx_to_cid.size();
ColumnId src_col_cid = idx_to_cid[idx_in_block];
@@ -626,6 +628,7 @@ Status VectorizedFnCall::evaluate_ann_range_search(
row_bitmap = *result.roaring;
// Process virtual column
+ bool dist_fulfilled = false;
if (range_search_runtime.dst_col_idx >= 0) {
// Prepare materialization if we can use result from index.
// Typical situation: range search and operator is LE or LT.
@@ -649,7 +652,7 @@ Status VectorizedFnCall::evaluate_ann_range_search(
}
virtual_column_iterator->prepare_materialization(std::move(distance_col),
std::move(result.row_ids));
- _virtual_column_is_fulfilled = true;
+ dist_fulfilled = true;
} else {
// Whether the ANN index should have produced distance depends on
metric and operator:
// - L2: distance is produced for LE/LT; not produced for GE/GT
@@ -663,17 +666,17 @@ Status VectorizedFnCall::evaluate_ann_range_search(
// If we expected distance but didn't get it, assert in debug to
catch logic errors.
DCHECK(!should_have_distance) << "Expected distance from ANN index
but got none";
#endif
- _virtual_column_is_fulfilled = false;
}
} else {
// Dest is not virtual column.
- _virtual_column_is_fulfilled = true;
+ dist_fulfilled = true;
}
- _has_been_executed = true;
+ evaluation_result.executed = true;
+ evaluation_result.dist_fulfilled = dist_fulfilled;
VLOG_DEBUG << fmt::format(
"Ann range search filtered {} rows, origin {} rows, virtual column
is full-filled: {}",
- origin_num - row_bitmap.cardinality(), origin_num,
_virtual_column_is_fulfilled);
+ origin_num - row_bitmap.cardinality(), origin_num, dist_fulfilled);
ann_index_stats = *stats;
return Status::OK();
diff --git a/be/src/exprs/vectorized_fn_call.h
b/be/src/exprs/vectorized_fn_call.h
index 979a0d5c1f8..f90206c6d81 100644
--- a/be/src/exprs/vectorized_fn_call.h
+++ b/be/src/exprs/vectorized_fn_call.h
@@ -92,7 +92,8 @@ public:
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
cid_to_index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats) override;
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ AnnRangeSearchEvaluationResult& result) override;
void prepare_ann_range_search(const doris::VectorSearchUserParams& params,
segment_v2::AnnRangeSearchRuntime& runtime,
diff --git a/be/src/exprs/vexpr.cpp b/be/src/exprs/vexpr.cpp
index 7ab49cb4349..5c80aecede0 100644
--- a/be/src/exprs/vexpr.cpp
+++ b/be/src/exprs/vexpr.cpp
@@ -1017,7 +1017,9 @@ Status VExpr::evaluate_ann_range_search(
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, AnnIndexStats& ann_index_stats) {
+ roaring::Roaring& row_bitmap, AnnIndexStats& ann_index_stats,
+ AnnRangeSearchEvaluationResult& result) {
+ result = {};
return Status::OK();
}
@@ -1035,14 +1037,6 @@ void VExpr::prepare_ann_range_search(const
doris::VectorSearchUserParams& params
}
}
-bool VExpr::ann_range_search_executedd() {
- return _has_been_executed;
-}
-
-bool VExpr::ann_dist_is_fulfilled() const {
- return _virtual_column_is_fulfilled;
-}
-
Status VExpr::execute_filter(VExprContext* context, const Block* block,
uint8_t* __restrict result_filter_data, size_t
rows, bool accept_null,
bool* can_filter_all) const {
diff --git a/be/src/exprs/vexpr.h b/be/src/exprs/vexpr.h
index 011c3a40dda..196ffd1b082 100644
--- a/be/src/exprs/vexpr.h
+++ b/be/src/exprs/vexpr.h
@@ -80,6 +80,15 @@ struct AnnRangeSearchRuntime;
using Selector = IColumn::Selector;
+struct AnnRangeSearchEvaluationResult {
+ // Indicates whether the expr row_bitmap has been updated.
+ bool executed = false;
+ // Indicates whether the virtual column is fulfilled.
+ // NOTE, if there is no virtual column in the expr tree, and expr
+ // is evaluated by ann index, this flag is still true.
+ bool dist_fulfilled = false;
+};
+
class VExpr {
public:
// resize inserted param column to make sure column size equal to
block.rows() and return param column index
@@ -343,7 +352,8 @@ public:
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
cid_to_index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats);
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ AnnRangeSearchEvaluationResult& result);
// Prepare the runtime for ANN range search.
// AnnRangeSearchRuntime is used to store the runtime information of ann
range search.
@@ -353,10 +363,6 @@ public:
segment_v2::AnnRangeSearchRuntime&
range_search_runtime,
bool& suitable_for_ann_index);
- bool ann_range_search_executedd();
-
- bool ann_dist_is_fulfilled() const;
-
virtual uint64_t get_digest(uint64_t seed) const;
protected:
@@ -439,13 +445,6 @@ protected:
// ensuring uniqueness during index traversal
uint32_t _index_unique_id = 0;
bool _enable_inverted_index_query = true;
-
- // Indicates whether the expr row_bitmap has been updated.
- bool _has_been_executed = false;
- // Indicates whether the virtual column is fulfilled.
- // NOTE, if there is no virtual column in the expr tree, and expr
- // is evaluated by ann index, this flag is still true.
- bool _virtual_column_is_fulfilled = false;
};
// NOLINTBEGIN(readability-function-size)
diff --git a/be/src/exprs/vexpr_context.cpp b/be/src/exprs/vexpr_context.cpp
index b6e5c2a08e8..29e0b25cf49 100644
--- a/be/src/exprs/vexpr_context.cpp
+++ b/be/src/exprs/vexpr_context.cpp
@@ -433,41 +433,58 @@ Status VExprContext::evaluate_ann_range_search(
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
const std::unordered_map<VExprContext*, std::unordered_map<ColumnId,
VExpr*>>&
common_expr_to_slotref_map,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats) {
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ bool* ann_range_search_executed) {
+ if (ann_range_search_executed != nullptr) {
+ *ann_range_search_executed = false;
+ }
if (_root == nullptr) {
return Status::OK();
}
+ AnnRangeSearchEvaluationResult evaluation_result;
RETURN_IF_ERROR(_root->evaluate_ann_range_search(
_ann_range_search_runtime, cid_to_index_iterators, idx_to_cid,
column_iterators,
- row_bitmap, ann_index_stats));
+ row_bitmap, ann_index_stats, evaluation_result));
- if (!_root->ann_range_search_executedd()) {
+ if (!evaluation_result.executed) {
return Status::OK();
}
+ if (ann_range_search_executed != nullptr) {
+ *ann_range_search_executed = true;
+ }
+
+ DCHECK(_index_context != nullptr);
+ _index_context->set_index_result_for_expr(
+ _root.get(),
+
segment_v2::InvertedIndexResultBitmap(std::make_shared<roaring::Roaring>(row_bitmap),
+
std::make_shared<roaring::Roaring>()));
- if (!_root->ann_dist_is_fulfilled()) {
+ if (!evaluation_result.dist_fulfilled) {
// Do not perform index scan in this case.
return Status::OK();
}
- auto src_col_idx = _ann_range_search_runtime.src_col_idx;
+ DCHECK_LT(_ann_range_search_runtime.src_col_idx, idx_to_cid.size());
+ const auto src_col_idx =
cast_set<int>(_ann_range_search_runtime.src_col_idx);
+ const auto src_col_key =
cast_set<ColumnId>(_ann_range_search_runtime.src_col_idx);
auto slot_ref_map_it = common_expr_to_slotref_map.find(this);
if (slot_ref_map_it == common_expr_to_slotref_map.end()) {
return Status::OK();
}
auto& slot_ref_map = slot_ref_map_it->second;
- ColumnId cid = idx_to_cid[src_col_idx];
- if (slot_ref_map.find(cid) == slot_ref_map.end()) {
+ auto slot_ref_it = slot_ref_map.find(src_col_key);
+ if (slot_ref_it == slot_ref_map.end()) {
return Status::OK();
}
- const VExpr* slot_ref_expr_addr = slot_ref_map.find(cid)->second;
- _index_context->set_true_for_index_status(slot_ref_expr_addr,
idx_to_cid[cid]);
+ const VExpr* slot_ref_expr_addr = slot_ref_it->second;
+ _index_context->set_true_for_index_status(slot_ref_expr_addr, src_col_idx);
VLOG_DEBUG << fmt::format(
"Evaluate ann range search for expr {}, src_col_idx {}, cid {},
row_bitmap "
"cardinality {}",
- _root->debug_string(), src_col_idx, cid, row_bitmap.cardinality());
+ _root->debug_string(), src_col_idx,
idx_to_cid[_ann_range_search_runtime.src_col_idx],
+ row_bitmap.cardinality());
return Status::OK();
}
diff --git a/be/src/exprs/vexpr_context.h b/be/src/exprs/vexpr_context.h
index fd4c42fe85a..ccd385d3a93 100644
--- a/be/src/exprs/vexpr_context.h
+++ b/be/src/exprs/vexpr_context.h
@@ -395,7 +395,8 @@ public:
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
const std::unordered_map<VExprContext*,
std::unordered_map<ColumnId, VExpr*>>&
common_expr_to_slotref_map,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats);
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ bool* ann_range_search_executed);
uint64_t get_digest(uint64_t seed) const;
diff --git a/be/src/exprs/virtual_slot_ref.cpp
b/be/src/exprs/virtual_slot_ref.cpp
index f7f767192ac..66448b39af1 100644
--- a/be/src/exprs/virtual_slot_ref.cpp
+++ b/be/src/exprs/virtual_slot_ref.cpp
@@ -234,12 +234,11 @@ Status VirtualSlotRef::evaluate_ann_range_search(
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
cid_to_index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats) {
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ AnnRangeSearchEvaluationResult& result) {
return _virtual_column_expr->evaluate_ann_range_search(
range_search_runtime, cid_to_index_iterators, idx_to_cid,
column_iterators, row_bitmap,
- ann_index_stats);
-
- return Status::OK();
+ ann_index_stats, result);
}
#include "common/compile_check_end.h"
} // namespace doris
diff --git a/be/src/exprs/virtual_slot_ref.h b/be/src/exprs/virtual_slot_ref.h
index e604d593376..382a3b45e34 100644
--- a/be/src/exprs/virtual_slot_ref.h
+++ b/be/src/exprs/virtual_slot_ref.h
@@ -107,7 +107,8 @@ public:
const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
cid_to_index_iterators,
const std::vector<ColumnId>& idx_to_cid,
const std::vector<std::unique_ptr<segment_v2::ColumnIterator>>&
column_iterators,
- roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats) override;
+ roaring::Roaring& row_bitmap, segment_v2::AnnIndexStats&
ann_index_stats,
+ AnnRangeSearchEvaluationResult& result) override;
#ifdef BE_TEST
// Test-only setter methods for unit testing
@@ -130,4 +131,4 @@ private:
DataTypePtr _column_data_type; ///< Data type of the column
};
#include "common/compile_check_end.h"
-} // namespace doris
\ No newline at end of file
+} // namespace doris
diff --git a/be/src/storage/segment/segment_iterator.cpp
b/be/src/storage/segment/segment_iterator.cpp
index 3a9c37aafcd..fb336f8b25b 100644
--- a/be/src/storage/segment/segment_iterator.cpp
+++ b/be/src/storage/segment/segment_iterator.cpp
@@ -1240,9 +1240,14 @@ Status SegmentIterator::_apply_index_expr() {
for (const auto& expr_ctx : _common_expr_ctxs_push_down) {
segment_v2::AnnIndexStats ann_index_stats;
size_t origin_rows = _row_bitmap.cardinality();
+ bool ann_range_search_executed = false;
RETURN_IF_ERROR(expr_ctx->evaluate_ann_range_search(
_index_iterators, _schema->column_ids(), _column_iterators,
- _common_expr_to_slotref_map, _row_bitmap, ann_index_stats));
+ _common_expr_to_slotref_map, _row_bitmap, ann_index_stats,
+ &ann_range_search_executed));
+ if (ann_range_search_executed) {
+ _opts.stats->ann_index_range_search_cnt++;
+ }
_opts.stats->rows_ann_index_range_filtered += (origin_rows -
_row_bitmap.cardinality());
_opts.stats->ann_index_load_ns +=
ann_index_stats.load_index_costs_ns.value();
_opts.stats->ann_index_range_search_ns +=
ann_index_stats.search_costs_ns.value();
@@ -1258,14 +1263,6 @@ Status SegmentIterator::_apply_index_expr() {
_opts.stats->ann_fall_back_brute_force_cnt +=
ann_index_stats.fall_back_brute_force_cnt;
}
- for (auto it = _common_expr_ctxs_push_down.begin(); it !=
_common_expr_ctxs_push_down.end();) {
- if ((*it)->root()->ann_range_search_executedd()) {
- _opts.stats->ann_index_range_search_cnt++;
- it = _common_expr_ctxs_push_down.erase(it);
- } else {
- ++it;
- }
- }
return Status::OK();
}
diff --git a/be/test/storage/index/ann/ann_range_search_test.cpp
b/be/test/storage/index/ann/ann_range_search_test.cpp
index 0cbf9ee6e9e..400e822695c 100644
--- a/be/test/storage/index/ann/ann_range_search_test.cpp
+++ b/be/test/storage/index/ann/ann_range_search_test.cpp
@@ -23,7 +23,9 @@
#include <cassert>
#include <cstdint>
#include <iostream>
+#include <map>
#include <memory>
+#include <unordered_map>
#include <vector>
#include "common/object_pool.h"
@@ -54,6 +56,18 @@ const std::string ann_range_search_thrift =
const std::string thrift_table_desc =
R"xxx({"1":{"lst":["rec",7,{"1":{"i32":0},"2":{"i32":0},"3":{"rec":{"1":{"lst":["rec",1,{"1":{"i32":0},"2":{"rec":{"1":{"i32":5},"5":{"i32":0}}}}]},"3":{"i64":-1}}},"4":{"i32":-1},"5":{"i32":-1},"6":{"i32":0},"7":{"i32":-1},"8":{"str":"id"},"9":{"i32":0},"10":{"tf":1},"11":{"i32":0},"12":{"tf":1},"13":{"tf":1},"14":{"tf":0},"17":{"i32":5}},{"1":{"i32":1},"2":{"i32":0},"3":{"rec":{"1":{"lst":["rec",2,{"1":{"i32":1},"4":{"tf":1},"5":{"lst":["tf",1,1]}},{"1":{"i32":0},"2":{"rec":{"1
[...]
+static std::shared_ptr<IndexExecContext> create_index_context(
+ const std::vector<ColumnId>& col_ids,
+ const std::vector<std::unique_ptr<segment_v2::IndexIterator>>&
index_iterators,
+ std::vector<IndexFieldNameAndTypePair>& storage_name_and_type,
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>&
+ common_expr_index_status) {
+ segment_v2::ColumnIteratorOptions column_iter_opts;
+ return std::make_shared<IndexExecContext>(col_ids, index_iterators,
storage_name_and_type,
+ common_expr_index_status,
nullptr, nullptr,
+ column_iter_opts);
+}
+
TEST_F(VectorSearchTest, TestPrepareAnnRangeSearch) {
TExpr texpr = read_from_json<TExpr>(ann_range_search_thrift);
// std::cout << "range_search thrift:\n" <<
apache::thrift::ThriftDebugString(texpr) << std::endl;
@@ -166,11 +180,17 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch) {
segment_v2::AnnIndexStats stats;
std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
common_expr_to_slotref_map;
+ std::vector<IndexFieldNameAndTypePair> storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
common_expr_index_status;
+ range_search_ctx->set_index_context(create_index_context(
+ idx_to_cid, cid_to_index_iterators, storage_name_and_type,
common_expr_index_status));
+ bool ann_range_search_executed = false;
ASSERT_TRUE(range_search_ctx
->evaluate_ann_range_search(cid_to_index_iterators,
idx_to_cid,
column_iterators,
common_expr_to_slotref_map,
- row_bitmap, stats)
+ row_bitmap, stats,
&ann_range_search_executed)
.ok());
+ EXPECT_TRUE(ann_range_search_executed);
doris::segment_v2::VirtualColumnIterator* virtual_column_iter =
dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get());
@@ -263,11 +283,17 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
segment_v2::AnnIndexStats stats;
std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
common_expr_to_slotref_map;
+ std::vector<IndexFieldNameAndTypePair> storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
common_expr_index_status;
+ range_search_ctx->set_index_context(create_index_context(
+ idx_to_cid, cid_to_index_iterators, storage_name_and_type,
common_expr_index_status));
+ bool ann_range_search_executed = false;
ASSERT_TRUE(range_search_ctx
->evaluate_ann_range_search(cid_to_index_iterators,
idx_to_cid,
column_iterators,
common_expr_to_slotref_map,
- row_bitmap, stats)
+ row_bitmap, stats,
&ann_range_search_executed)
.ok());
+ EXPECT_TRUE(ann_range_search_executed);
doris::segment_v2::VirtualColumnIterator* virtual_column_iter =
dynamic_cast<doris::segment_v2::VirtualColumnIterator*>(column_iterators[3].get());
@@ -284,6 +310,184 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
EXPECT_EQ(get_row_id_to_idx.size(), 10);
}
+TEST_F(VectorSearchTest,
TestEvaluateAnnRangeSearchStateDoesNotLeakAcrossClones) {
+ TExpr texpr = read_from_json<TExpr>(ann_range_search_thrift);
+ TDescriptorTable table1 =
read_from_json<TDescriptorTable>(thrift_table_desc);
+ std::unique_ptr<doris::ObjectPool> pool =
std::make_unique<doris::ObjectPool>();
+ auto desc_tbl = std::make_unique<DescriptorTbl>();
+ DescriptorTbl* desc_tbl_ptr = desc_tbl.get();
+ ASSERT_TRUE(DescriptorTbl::create(pool.get(), table1,
&(desc_tbl_ptr)).ok());
+ RowDescriptor row_desc = RowDescriptor(*desc_tbl_ptr, {0}, {false});
+ std::unique_ptr<doris::RuntimeState> state =
std::make_unique<doris::RuntimeState>();
+ state->set_desc_tbl(desc_tbl_ptr);
+
+ VExprContextSPtr range_search_ctx;
+ ASSERT_TRUE(VExpr::create_expr_tree(texpr, range_search_ctx).ok());
+ ASSERT_TRUE(range_search_ctx->prepare(state.get(), row_desc).ok());
+ ASSERT_TRUE(range_search_ctx->open(state.get()).ok());
+ doris::VectorSearchUserParams user_params;
+ range_search_ctx->prepare_ann_range_search(user_params);
+
+ VExprContextSPtr segment_with_ann_ctx;
+ ASSERT_TRUE(range_search_ctx->clone(state.get(),
segment_with_ann_ctx).ok());
+ VExprContextSPtr segment_without_ann_ctx;
+ ASSERT_TRUE(range_search_ctx->clone(state.get(),
segment_without_ann_ctx).ok());
+ ASSERT_EQ(segment_with_ann_ctx->root().get(),
segment_without_ann_ctx->root().get());
+
+ std::vector<ColumnId> idx_to_cid = {0, 1, 2, 3};
+ std::vector<std::unique_ptr<segment_v2::IndexIterator>>
ann_index_iterators(4);
+ ann_index_iterators[1] =
std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>();
+ auto* mock_ann_index_iter =
dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>(
+ ann_index_iterators[1].get());
+ std::map<std::string, std::string> properties;
+ properties["index_type"] = "hnsw";
+ properties["metric_type"] = "l2_distance";
+ properties["dim"] = "8";
+ auto pair = vector_search_utils::create_tmp_ann_index_reader(properties);
+ mock_ann_index_iter->_ann_reader = pair.second;
+
+ std::vector<std::unique_ptr<segment_v2::ColumnIterator>>
ann_column_iterators(4);
+ ann_column_iterators[3] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
+ std::vector<IndexFieldNameAndTypePair> ann_storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
ann_index_status;
+ segment_with_ann_ctx->set_index_context(create_index_context(
+ idx_to_cid, ann_index_iterators, ann_storage_name_and_type,
ann_index_status));
+
+ EXPECT_CALL(*mock_ann_index_iter, range_search(testing::_, testing::_,
testing::_, testing::_))
+ .WillOnce(testing::Invoke([](const
doris::segment_v2::AnnRangeSearchParams& params,
+ const doris::VectorSearchUserParams&
custom_params,
+
doris::segment_v2::AnnRangeSearchResult* result,
+ doris::segment_v2::AnnIndexStats*
stats) {
+ result->roaring = std::make_shared<roaring::Roaring>();
+ result->roaring->add(1);
+ result->roaring->add(3);
+ result->row_ids = nullptr;
+ result->distance = nullptr;
+ return Status::OK();
+ }));
+
+ roaring::Roaring ann_row_bitmap;
+ segment_v2::AnnIndexStats ann_stats;
+ std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
+ common_expr_to_slotref_map;
+ bool ann_range_search_executed = false;
+ ASSERT_TRUE(segment_with_ann_ctx
+ ->evaluate_ann_range_search(ann_index_iterators,
idx_to_cid,
+ ann_column_iterators,
+
common_expr_to_slotref_map, ann_row_bitmap,
+ ann_stats,
&ann_range_search_executed)
+ .ok());
+ EXPECT_TRUE(ann_range_search_executed);
+ const auto* ann_result =
segment_with_ann_ctx->get_index_context()->get_index_result_for_expr(
+ segment_with_ann_ctx->root().get());
+ ASSERT_NE(ann_result, nullptr);
+ ASSERT_NE(ann_result->get_data_bitmap(), nullptr);
+ EXPECT_EQ(ann_result->get_data_bitmap()->cardinality(), 2);
+
+ std::vector<std::unique_ptr<segment_v2::IndexIterator>>
no_ann_index_iterators(4);
+ std::vector<std::unique_ptr<segment_v2::ColumnIterator>>
no_ann_column_iterators(4);
+ no_ann_column_iterators[3] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
+ std::vector<IndexFieldNameAndTypePair> no_ann_storage_name_and_type(4);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
no_ann_index_status;
+ segment_without_ann_ctx->set_index_context(create_index_context(
+ idx_to_cid, no_ann_index_iterators, no_ann_storage_name_and_type,
no_ann_index_status));
+
+ roaring::Roaring no_ann_row_bitmap;
+ segment_v2::AnnIndexStats no_ann_stats;
+ bool no_ann_range_search_executed = true;
+ ASSERT_TRUE(segment_without_ann_ctx
+ ->evaluate_ann_range_search(no_ann_index_iterators,
idx_to_cid,
+ no_ann_column_iterators,
+
common_expr_to_slotref_map, no_ann_row_bitmap,
+ no_ann_stats,
&no_ann_range_search_executed)
+ .ok());
+ EXPECT_FALSE(no_ann_range_search_executed);
+
EXPECT_FALSE(segment_without_ann_ctx->get_index_context()->has_index_result_for_expr(
+ segment_without_ann_ctx->root().get()));
+}
+
+TEST_F(VectorSearchTest,
TestEvaluateAnnRangeSearchUsesSourceColumnIndexForSlotMap) {
+ TExpr texpr = read_from_json<TExpr>(ann_range_search_thrift);
+ TExprNode& opNode = texpr.nodes[0];
+ opNode.opcode = TExprOpcode::LT;
+ opNode.fn.name.function_name = doris::NameLess::name;
+ TDescriptorTable table1 =
read_from_json<TDescriptorTable>(thrift_table_desc);
+ std::unique_ptr<doris::ObjectPool> pool =
std::make_unique<doris::ObjectPool>();
+ auto desc_tbl = std::make_unique<DescriptorTbl>();
+ DescriptorTbl* desc_tbl_ptr = desc_tbl.get();
+ ASSERT_TRUE(DescriptorTbl::create(pool.get(), table1,
&(desc_tbl_ptr)).ok());
+ RowDescriptor row_desc = RowDescriptor(*desc_tbl_ptr, {0}, {false});
+ std::unique_ptr<doris::RuntimeState> state =
std::make_unique<doris::RuntimeState>();
+ state->set_desc_tbl(desc_tbl_ptr);
+
+ VExprContextSPtr range_search_ctx;
+ ASSERT_TRUE(VExpr::create_expr_tree(texpr, range_search_ctx).ok());
+ ASSERT_TRUE(range_search_ctx->prepare(state.get(), row_desc).ok());
+ ASSERT_TRUE(range_search_ctx->open(state.get()).ok());
+ doris::VectorSearchUserParams user_params;
+ range_search_ctx->prepare_ann_range_search(user_params);
+ ASSERT_EQ(range_search_ctx->_ann_range_search_runtime.src_col_idx, 1);
+ ASSERT_EQ(range_search_ctx->_ann_range_search_runtime.dst_col_idx, 3);
+
+ std::vector<ColumnId> idx_to_cid = {0, 5, 6, 7};
+ std::vector<std::unique_ptr<segment_v2::IndexIterator>>
cid_to_index_iterators(8);
+ cid_to_index_iterators[5] =
+
std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>();
+ auto* mock_ann_index_iter =
dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>(
+ cid_to_index_iterators[5].get());
+ std::map<std::string, std::string> properties;
+ properties["index_type"] = "hnsw";
+ properties["metric_type"] = "l2_distance";
+ properties["dim"] = "8";
+ auto pair = vector_search_utils::create_tmp_ann_index_reader(properties);
+ mock_ann_index_iter->_ann_reader = pair.second;
+
+ std::vector<std::unique_ptr<segment_v2::ColumnIterator>>
column_iterators(8);
+ column_iterators[7] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
+ std::vector<IndexFieldNameAndTypePair> storage_name_and_type(8);
+ std::unordered_map<ColumnId, std::unordered_map<const VExpr*, bool>>
common_expr_index_status;
+ common_expr_index_status[5][range_search_ctx->root().get()] = false;
+ range_search_ctx->set_index_context(create_index_context(
+ idx_to_cid, cid_to_index_iterators, storage_name_and_type,
common_expr_index_status));
+
+ std::unordered_map<VExprContext*, std::unordered_map<ColumnId, VExpr*>>
+ common_expr_to_slotref_map;
+ common_expr_to_slotref_map[range_search_ctx.get()][1] =
range_search_ctx->root().get();
+
+ EXPECT_CALL(*mock_ann_index_iter, range_search(testing::_, testing::_,
testing::_, testing::_))
+ .WillOnce(testing::Invoke([](const
doris::segment_v2::AnnRangeSearchParams& params,
+ const doris::VectorSearchUserParams&
custom_params,
+
doris::segment_v2::AnnRangeSearchResult* result,
+ doris::segment_v2::AnnIndexStats*
stats) {
+ constexpr size_t num_results = 3;
+ result->roaring = std::make_shared<roaring::Roaring>();
+ result->row_ids = std::make_shared<std::vector<uint64_t>>();
+ result->distance = std::shared_ptr<float[]>(new
float[num_results]);
+ for (size_t i = 0; i < num_results; ++i) {
+ result->roaring->add(i * 2);
+ result->row_ids->push_back(i * 2);
+ result->distance[i] = static_cast<float>(i);
+ }
+ return Status::OK();
+ }));
+
+ roaring::Roaring row_bitmap;
+ segment_v2::AnnIndexStats stats;
+ bool ann_range_search_executed = false;
+ ASSERT_TRUE(range_search_ctx
+ ->evaluate_ann_range_search(cid_to_index_iterators,
idx_to_cid,
+ column_iterators,
common_expr_to_slotref_map,
+ row_bitmap, stats,
&ann_range_search_executed)
+ .ok());
+ EXPECT_TRUE(ann_range_search_executed);
+ EXPECT_TRUE(common_expr_index_status[5][range_search_ctx->root().get()]);
+ const auto* result =
range_search_ctx->get_index_context()->get_index_result_for_expr(
+ range_search_ctx->root().get());
+ ASSERT_NE(result, nullptr);
+ ASSERT_NE(result->get_data_bitmap(), nullptr);
+ EXPECT_EQ(result->get_data_bitmap()->cardinality(), 3);
+}
+
TEST_F(VectorSearchTest, TestRangeSearchRuntimeInfoToString) {
// Test default constructor
doris::segment_v2::AnnRangeSearchRuntime runtime_info;
@@ -664,7 +868,7 @@ TEST_F(VectorSearchTest,
TestEvaluateAnnRangeSearch_DimensionMismatch) {
auto st = range_search_ctx->evaluate_ann_range_search(
cid_to_index_iterators, idx_to_cid, column_iterators,
common_expr_to_slotref_map,
- row_bitmap, stats);
+ row_bitmap, stats, nullptr);
EXPECT_FALSE(st.ok());
EXPECT_TRUE(st.is<doris::ErrorCode::INVALID_ARGUMENT>());
}
diff --git
a/regression-test/suites/ann_index_p0/ann_range_search_pushdown_regression.groovy
b/regression-test/suites/ann_index_p0/ann_range_search_pushdown_regression.groovy
new file mode 100644
index 00000000000..c3122b9d1e6
--- /dev/null
+++
b/regression-test/suites/ann_index_p0/ann_range_search_pushdown_regression.groovy
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import groovy.json.JsonSlurper
+
+def getProfileList = {
+ def dst = "http://" + context.config.feHttpAddress
+ def conn = new URL(dst + "/rest/v1/query_profile").openConnection()
+ conn.setRequestMethod("GET")
+ def encoding = Base64.getEncoder().encodeToString((context.config.feHttpUser + ":" +
+ (context.config.feHttpPassword == null ? "" : context.config.feHttpPassword))
+ .getBytes("UTF-8"))
+ conn.setRequestProperty("Authorization", "Basic ${encoding}")
+ return conn.getInputStream().getText()
+}
+
+def getProfile = { id ->
+ def dst = "http://" + context.config.feHttpAddress
+ def conn = new URL(dst + "/api/profile/text/?query_id=$id").openConnection()
+ conn.setRequestMethod("GET")
+ def encoding = Base64.getEncoder().encodeToString((context.config.feHttpUser + ":" +
+ (context.config.feHttpPassword == null ? "" : context.config.feHttpPassword))
+ .getBytes("UTF-8"))
+ conn.setRequestProperty("Authorization", "Basic ${encoding}")
+ return conn.getInputStream().getText()
+}
+
+def extractCounterValue = { String profileText, String counterName ->
+ for (def line : profileText.split("\n")) {
+ if (line.contains(counterName + ":")) {
+ def m = (line =~ /${java.util.regex.Pattern.quote(counterName)}:\s*([0-9]+(?:\.[0-9]+)?)/)
+ if (m.find()) {
+ return m.group(1)
+ }
+ }
+ }
+ return null
+}
+
+suite("ann_range_search_pushdown_regression", "nonConcurrent") {
+ def getProfileWithToken = { token ->
+ String profileId = ""
+ int attempts = 0
+ while (attempts < 10 && (profileId == null || profileId == "")) {
+ List profileData = new JsonSlurper().parseText(getProfileList()).data.rows
+ for (def profileItem in profileData) {
+ if (profileItem["Sql Statement"].toString().contains(token)) {
+ profileId = profileItem["Profile ID"].toString()
+ break
+ }
+ }
+ if (profileId == null || profileId == "") {
+ Thread.sleep(300)
+ }
+ attempts++
+ }
+ assertTrue(profileId != null && profileId != "")
+ Thread.sleep(800)
+ return getProfile(profileId).toString()
+ }
+
+ sql "unset variable all;"
+ sql "set enable_common_expr_pushdown=true;"
+ sql "set experimental_enable_virtual_slot_for_cse=true;"
+ sql "set enable_no_need_read_data_opt=true;"
+ sql "set enable_profile=true;"
+ sql "set profile_level=2;"
+ sql "set parallel_pipeline_task_num=1;"
+ sql "set enable_sql_cache=false;"
+ sql "set enable_condition_cache=false;"
+
+ // Case 1: one rowset has an IVF ANN index, while the surrounding small
+ // rowsets skip ANN index building because they have fewer rows than nlist.
+ // This creates a single scan with mixed indexed/non-indexed segments. The
+ // ANN range search execution state must be per segment instead of stored on
+ // the shared expression root.
+ sql "drop table if exists ann_range_mixed_segment_index"
+ sql """
+ create table ann_range_mixed_segment_index (
+ id int not null,
+ embedding array<float> not null,
+ index idx_embedding(`embedding`) using ann properties(
+ "index_type"="ivf",
+ "metric_type"="l2_distance",
+ "dim"="3",
+ "nlist"="2"
+ )
+ ) duplicate key(id)
+ distributed by hash(id) buckets 1
+ properties(
+ "replication_num"="1",
+ "disable_auto_compaction"="true"
+ );
+ """
+
+ sql "set ivf_nprobe=2;"
+ sql """
+ insert into ann_range_mixed_segment_index values
+ (100, [10.0, 10.0, 10.0]);
+ """
+ sql """
+ insert into ann_range_mixed_segment_index values
+ (1, [0.0, 0.0, 0.0]),
+ (2, [0.1, 0.0, 0.0]),
+ (3, [0.2, 0.0, 0.0]),
+ (4, [0.3, 0.0, 0.0]);
+ """
+ sql """
+ insert into ann_range_mixed_segment_index values
+ (101, [10.0, 10.0, 10.0]);
+ """
+
+ def tokenMixed = UUID.randomUUID().toString()
+ def mixedRows = sql """
+ select id, "${tokenMixed}"
+ from ann_range_mixed_segment_index
+ where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0
+ order by id;
+ """
+ assertEquals([1, 2, 3, 4], mixedRows.collect { it[0] })
+
+ def mixedProfile = getProfileWithToken(tokenMixed)
+ def rangeSearchCnt = extractCounterValue(mixedProfile, "AnnIndexRangeSearchCnt")
+ logger.info("Mixed indexed/non-indexed segment AnnIndexRangeSearchCnt=${rangeSearchCnt}")
+ assertEquals("1", rangeSearchCnt)
+
+}
diff --git a/regression-test/suites/ann_index_p0/ann_range_search_source_index_status_regression.groovy b/regression-test/suites/ann_index_p0/ann_range_search_source_index_status_regression.groovy
new file mode 100644
index 00000000000..f4a881e75ce
--- /dev/null
+++ b/regression-test/suites/ann_index_p0/ann_range_search_source_index_status_regression.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("ann_range_search_source_index_status_regression", "nonConcurrent") {
+ sql "unset variable all;"
+ sql "set enable_common_expr_pushdown=true;"
+ sql "set experimental_enable_virtual_slot_for_cse=true;"
+ sql "set enable_no_need_read_data_opt=true;"
+ sql "set parallel_pipeline_task_num=1;"
+ sql "set enable_sql_cache=false;"
+ sql "set enable_condition_cache=false;"
+
+ // The source column's index in the scan block differs from its storage
+ // ColumnId. ANN range search should mark the common expression status by
+ // scan-block source column index, so the embedding column can be skipped
+ // when it is only used by the satisfied ANN predicate.
+ sql "drop table if exists ann_range_source_index_status"
+ sql """
+ create table ann_range_source_index_status (
+ id int not null,
+ pad_int int not null,
+ pad_text string not null,
+ embedding array<float> not null,
+ value int not null,
+ index idx_embedding(`embedding`) using ann properties(
+ "index_type"="hnsw",
+ "metric_type"="l2_distance",
+ "dim"="3"
+ )
+ ) duplicate key(id)
+ distributed by hash(id) buckets 1
+ properties("replication_num"="1");
+ """
+
+ sql """
+ insert into ann_range_source_index_status values
+ (1, 10, 'a', [0.0, 0.0, 0.0], 100),
+ (2, 20, 'b', [0.1, 0.0, 0.0], 200),
+ (3, 30, 'c', [0.2, 0.0, 0.0], 300),
+ (4, 40, 'd', [0.3, 0.0, 0.0], 400),
+ (5, 50, 'e', [0.4, 0.0, 0.0], 500),
+ (6, 60, 'f', [0.5, 0.0, 0.0], 600),
+ (7, 70, 'g', [0.6, 0.0, 0.0], 700),
+ (8, 80, 'h', [0.7, 0.0, 0.0], 800),
+ (9, 90, 'i', [0.8, 0.0, 0.0], 900),
+ (10, 100, 'j', [0.9, 0.0, 0.0], 1000);
+ """
+
+ try {
+ GetDebugPoint().enableDebugPointForAllBEs(
+ "segment_iterator._read_columns_by_index", [column_name: "embedding"])
+ def indexOnlyRows = sql """
+ select id
+ from ann_range_source_index_status
+ where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0
+ order by id;
+ """
+ assertEquals([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], indexOnlyRows.collect { it[0] })
+ } finally {
+ GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
+ }
+
+ def readEmbeddingRows = sql """
+ select id, embedding
+ from ann_range_source_index_status
+ where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0
+ order by id;
+ """
+ assertEquals([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], readEmbeddingRows.collect { it[0] })
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]