This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d5e95188554 [Bug](scan) Preserve IN_LIST runtime filter predicates 
when key range is a scope range (#62027)
d5e95188554 is described below

commit d5e951885544bcde7a08e49b336395677a304876
Author: Pxl <[email protected]>
AuthorDate: Fri Apr 3 11:48:10 2026 +0800

    [Bug](scan) Preserve IN_LIST runtime filter predicates when key range is a 
scope range (#62027)
    
    This pull request addresses a bug in the OLAP scan operator where
    `IN_LIST` predicates could be incorrectly erased when both `MINMAX` and
    `IN` runtime filters targeted the same key column, and the number of
    `IN` values exceeded the maximum allowed for pushdown. The changes
    ensure that `IN_LIST` predicates are preserved in such cases, preventing
    incorrect query results. Additionally, a regression test is added to
    verify the fix.
    
    **Bug fix in predicate handling:**
    
    * Modified the logic in `_build_key_ranges_and_filters()` within
    `olap_scan_operator.cpp` to ensure that `IN_LIST` predicates are not
    erased when the key range is a scope range (e.g., `>= X AND <= Y`) and
    the `IN` filter's value count exceeds
    `max_pushdown_conditions_per_column`. This preserves filtering semantics
    that are not captured by the scope range.
    
[[1]](diffhunk://#diff-3ddc75656071d9c0e6b0be450e152a1c94559f7e70ea820e7f0c80a7078e3292R972)
    
[[2]](diffhunk://#diff-3ddc75656071d9c0e6b0be450e152a1c94559f7e70ea820e7f0c80a7078e3292R986)
    
[[3]](diffhunk://#diff-3ddc75656071d9c0e6b0be450e152a1c94559f7e70ea820e7f0c80a7078e3292L986-R1013)
    
    * Enhanced the profiling output in `_process_conjuncts()` to accurately
    reflect the set of predicates that will reach the storage layer after
    key range and filter construction. This helps with debugging and
    verification of predicate pushdown.
    
    **Testing and regression coverage:**
    
    * Added a new regression test
    `test_rf_in_list_not_erased_by_scope_range.groovy` to verify that
    `IN_LIST` predicates are not incorrectly erased when both `MINMAX` and
    `IN` filters are present and the `IN` list is too large to be absorbed
    into the key range.
    
    * Added the corresponding expected output file for the new regression
    test.
---
 be/src/exec/operator/olap_scan_operator.cpp        | 44 ++++++++++-
 be/src/exec/operator/scan_operator.cpp             | 41 +++++-----
 be/src/storage/predicate/column_predicate.h        |  3 +-
 be/src/storage/predicate/comparison_predicate.h    |  6 --
 be/src/storage/predicate/in_list_predicate.h       |  7 --
 be/src/storage/predicate/null_predicate.h          |  1 -
 .../test_rf_in_list_not_erased_by_scope_range.out  |  9 +++
 ...est_rf_in_list_not_erased_by_scope_range.groovy | 91 ++++++++++++++++++++++
 8 files changed, 166 insertions(+), 36 deletions(-)

diff --git a/be/src/exec/operator/olap_scan_operator.cpp 
b/be/src/exec/operator/olap_scan_operator.cpp
index 45c4728ba57..ccdd82bc5f1 100644
--- a/be/src/exec/operator/olap_scan_operator.cpp
+++ b/be/src/exec/operator/olap_scan_operator.cpp
@@ -951,6 +951,7 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() {
             const auto& value_range = iter->second;
 
             std::optional<int> key_to_erase;
+            bool is_fixed_value_range = false;
 
             RETURN_IF_ERROR(std::visit(
                     [&](auto&& range) {
@@ -964,6 +965,7 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() {
                                                                &exact_range, 
&eos, &should_break));
                             if (exact_range) {
                                 key_to_erase = iter->first;
+                                is_fixed_value_range = 
range.is_fixed_value_range();
                             }
                         } else {
                             // if exceed max_pushdown_conditions_per_column, 
use whole_value_rang instead
@@ -981,9 +983,49 @@ Status OlapScanLocalState::_build_key_ranges_and_filters() 
{
             if (key_to_erase.has_value()) {
                 _slot_id_to_value_range.erase(*key_to_erase);
 
+                // Determine which predicates are subsumed by the scan key 
range and can
+                // be removed.  The rule depends on the ColumnValueRange type:
+                //
+                //   Fixed value range  →  scan keys are exact point lookups, 
so both
+                //                         comparison (EQ/LT/LE/GT/GE) and 
positive IN_LIST
+                //                         predicates are fully captured and 
can be erased.
+                //
+                //   Scope range        →  scan keys only capture [low, high] 
boundaries,
+                //                         so only comparison predicates are 
subsumed.
+                //                         IN_LIST predicates (whose values 
may NOT have been
+                //                         absorbed into the ColumnValueRange, 
e.g., because
+                //                         the value count exceeded 
max_pushdown_conditions_per_column)
+                //                         must be preserved.
+                //
+                // In either case, predicates with negation semantics 
(effective NE / NOT_IN_LIST)
+                // are never subsumed by scan key ranges and must always be 
preserved.
+                auto can_erase_predicate = [is_fixed_value_range](const 
ColumnPredicate& pred) {
+                    PredicateType pt = pred.type();
+                    bool opposite = pred.opposite();
+
+                    // Effective NE: never subsumed by any scan key range.
+                    if ((pt == PredicateType::NE && !opposite) ||
+                        (pt == PredicateType::EQ && opposite)) {
+                        return false;
+                    }
+                    // Comparison predicates (EQ/LT/LE/GT/GE) or 
IS_NULL/IS_NOT_NULL: subsumed by both
+                    // fixed value and scope ranges.
+                    if (PredicateTypeTraits::is_comparison(pt) || pt == 
PredicateType::IS_NULL ||
+                        pt == PredicateType::IS_NOT_NULL) {
+                        return true;
+                    }
+                    // Effective IN_LIST: only subsumed by fixed value range.
+                    if ((pt == PredicateType::IN_LIST && !opposite) ||
+                        (pt == PredicateType::NOT_IN_LIST && opposite)) {
+                        return is_fixed_value_range;
+                    }
+                    // Everything else (BF, BITMAP, NOT_IN_LIST, etc.): keep.
+                    return false;
+                };
+
                 std::vector<std::shared_ptr<ColumnPredicate>> new_predicates;
                 for (const auto& it : _slot_id_to_predicates[*key_to_erase]) {
-                    if (!it->could_be_erased()) {
+                    if (!can_erase_predicate(*it)) {
                         new_predicates.push_back(it);
                     }
                 }
diff --git a/be/src/exec/operator/scan_operator.cpp 
b/be/src/exec/operator/scan_operator.cpp
index 251e4e437ca..ceb0e1254be 100644
--- a/be/src/exec/operator/scan_operator.cpp
+++ b/be/src/exec/operator/scan_operator.cpp
@@ -153,6 +153,23 @@ Status ScanLocalState<Derived>::init(RuntimeState* state, 
LocalStateInfo& info)
     return Status::OK();
 }
 
+static std::string predicates_to_string(
+        const phmap::flat_hash_map<int, 
std::vector<std::shared_ptr<ColumnPredicate>>>&
+                slot_id_to_predicates) {
+    fmt::memory_buffer debug_string_buffer;
+    for (const auto& [slot_id, predicates] : slot_id_to_predicates) {
+        if (predicates.empty()) {
+            continue;
+        }
+        fmt::format_to(debug_string_buffer, "Slot ID: {}: [", slot_id);
+        for (const auto& predicate : predicates) {
+            fmt::format_to(debug_string_buffer, "{{{}}}, ", 
predicate->debug_string());
+        }
+        fmt::format_to(debug_string_buffer, "] ");
+    }
+    return fmt::to_string(debug_string_buffer);
+}
+
 template <typename Derived>
 Status ScanLocalState<Derived>::open(RuntimeState* state) {
     SCOPED_TIMER(exec_time_counter());
@@ -193,6 +210,11 @@ Status ScanLocalState<Derived>::open(RuntimeState* state) {
 
     RETURN_IF_ERROR(_process_conjuncts(state));
 
+    if (state->enable_profile()) {
+        custom_profile()->add_info_string("PushDownPredicates",
+                                          
predicates_to_string(_slot_id_to_predicates));
+    }
+
     auto status = _eos ? Status::OK() : _prepare_scanners();
     RETURN_IF_ERROR(status);
     if (auto ctx = _scanner_ctx.load()) {
@@ -203,23 +225,6 @@ Status ScanLocalState<Derived>::open(RuntimeState* state) {
     return status;
 }
 
-static std::string predicates_to_string(
-        const phmap::flat_hash_map<int, 
std::vector<std::shared_ptr<ColumnPredicate>>>&
-                slot_id_to_predicates) {
-    fmt::memory_buffer debug_string_buffer;
-    for (const auto& [slot_id, predicates] : slot_id_to_predicates) {
-        if (predicates.empty()) {
-            continue;
-        }
-        fmt::format_to(debug_string_buffer, "Slot ID: {}: [", slot_id);
-        for (const auto& predicate : predicates) {
-            fmt::format_to(debug_string_buffer, "{{{}}}, ", 
predicate->debug_string());
-        }
-        fmt::format_to(debug_string_buffer, "] ");
-    }
-    return fmt::to_string(debug_string_buffer);
-}
-
 static void init_slot_value_range(
         phmap::flat_hash_map<int, ColumnValueRangeType>& 
slot_id_to_value_range,
         SlotDescriptor* slot, const DataTypePtr type_desc) {
@@ -325,8 +330,6 @@ Status 
ScanLocalState<Derived>::_normalize_conjuncts(RuntimeState* state) {
     }
 
     if (state->enable_profile()) {
-        custom_profile()->add_info_string("PushDownPredicates",
-                                          
predicates_to_string(_slot_id_to_predicates));
         std::string message;
         for (auto& conjunct : _conjuncts) {
             if (conjunct->root()) {
diff --git a/be/src/storage/predicate/column_predicate.h 
b/be/src/storage/predicate/column_predicate.h
index 59b6813ad4e..6ac2c45e93f 100644
--- a/be/src/storage/predicate/column_predicate.h
+++ b/be/src/storage/predicate/column_predicate.h
@@ -226,8 +226,7 @@ public:
     }
 
     virtual double get_ignore_threshold() const { return 0; }
-    // If this predicate acts on the key column, this predicate should be 
erased.
-    virtual bool could_be_erased() const { return false; }
+
     // Return the size of value set for IN/NOT IN predicates and 0 for others.
     virtual std::string debug_string() const {
         fmt::memory_buffer debug_string_buffer;
diff --git a/be/src/storage/predicate/comparison_predicate.h 
b/be/src/storage/predicate/comparison_predicate.h
index a76ff865530..1a7885a8fa5 100644
--- a/be/src/storage/predicate/comparison_predicate.h
+++ b/be/src/storage/predicate/comparison_predicate.h
@@ -51,12 +51,6 @@ public:
                        ColumnPredicate::debug_string());
         return fmt::to_string(debug_string_buffer);
     }
-    bool could_be_erased() const override {
-        if ((PT == PredicateType::NE && !_opposite) || (PT == 
PredicateType::EQ && _opposite)) {
-            return false;
-        }
-        return true;
-    }
 
     PredicateType type() const override { return PT; }
 
diff --git a/be/src/storage/predicate/in_list_predicate.h 
b/be/src/storage/predicate/in_list_predicate.h
index 8aec120bd94..3ed8a3d3f74 100644
--- a/be/src/storage/predicate/in_list_predicate.h
+++ b/be/src/storage/predicate/in_list_predicate.h
@@ -147,13 +147,6 @@ public:
 
     PredicateType type() const override { return PT; }
 
-    bool could_be_erased() const override {
-        if ((PT == PredicateType::NOT_IN_LIST && !_opposite) ||
-            (PT == PredicateType::IN_LIST && _opposite)) {
-            return false;
-        }
-        return true;
-    }
     Status evaluate(const IndexFieldNameAndTypePair& name_with_type, 
IndexIterator* iterator,
                     uint32_t num_rows, roaring::Roaring* result) const 
override {
         if (iterator == nullptr) {
diff --git a/be/src/storage/predicate/null_predicate.h 
b/be/src/storage/predicate/null_predicate.h
index 5a9c547a525..8e346208877 100644
--- a/be/src/storage/predicate/null_predicate.h
+++ b/be/src/storage/predicate/null_predicate.h
@@ -58,7 +58,6 @@ public:
                        ColumnPredicate::debug_string(), _is_null);
         return fmt::to_string(debug_string_buffer);
     }
-    bool could_be_erased() const override { return true; }
 
     PredicateType type() const override;
 
diff --git 
a/regression-test/data/correctness_p0/test_rf_in_list_not_erased_by_scope_range.out
 
b/regression-test/data/correctness_p0/test_rf_in_list_not_erased_by_scope_range.out
new file mode 100644
index 00000000000..48792fec92c
--- /dev/null
+++ 
b/regression-test/data/correctness_p0/test_rf_in_list_not_erased_by_scope_range.out
@@ -0,0 +1,9 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !join --
+10     10
+12     12
+2      2
+4      4
+6      6
+8      8
+
diff --git 
a/regression-test/suites/correctness_p0/test_rf_in_list_not_erased_by_scope_range.groovy
 
b/regression-test/suites/correctness_p0/test_rf_in_list_not_erased_by_scope_range.groovy
new file mode 100644
index 00000000000..113524d68dd
--- /dev/null
+++ 
b/regression-test/suites/correctness_p0/test_rf_in_list_not_erased_by_scope_range.groovy
@@ -0,0 +1,91 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This test verifies that when both MINMAX and IN runtime filters target the 
same
+// key column, and the IN filter's value count exceeds 
max_pushdown_conditions_per_column,
+// the IN_LIST predicate is NOT incorrectly erased by the key range 
construction logic.
+// Regression test for the bug where _build_key_ranges_and_filters() erased 
IN_LIST
+// predicates when the ColumnValueRange was a scope range (from MINMAX filter).
+suite("test_rf_in_list_not_erased_by_scope_range") {
+    sql "drop table if exists rf_scope_probe;"
+    sql "drop table if exists rf_scope_build;"
+
+    sql """
+        CREATE TABLE rf_scope_probe (
+            k1 BIGINT,
+            v1 INT
+        )
+        DUPLICATE KEY(k1)
+        DISTRIBUTED BY HASH(k1) BUCKETS 1
+        PROPERTIES ("replication_num" = "1");
+    """
+
+    sql """
+        CREATE TABLE rf_scope_build (
+            k1 BIGINT,
+            v1 INT
+        )
+        DUPLICATE KEY(k1)
+        DISTRIBUTED BY HASH(k1) BUCKETS 1
+        PROPERTIES ("replication_num" = "1");
+    """
+
+    // Probe table: insert 20 rows with k1 from 1 to 20.
+    // The build side will only match a subset (k1 in {2,4,6,8,10,12}).
+    // Rows NOT in this subset (k1=1,3,5,7,9,11,13..20) should be filtered out
+    // by the IN_LIST runtime filter.
+    sql """
+        INSERT INTO rf_scope_probe VALUES
+            (1, 1), (2, 2), (3, 3), (4, 4), (5, 5),
+            (6, 6), (7, 7), (8, 8), (9, 9), (10, 10),
+            (11, 11), (12, 12), (13, 13), (14, 14), (15, 15),
+            (16, 16), (17, 17), (18, 18), (19, 19), (20, 20);
+    """
+
+    // Build table: 6 distinct k1 values. This exceeds 
max_pushdown_conditions_per_column=5
+    // so the IN values are NOT added to ColumnValueRange, but the IN_LIST 
predicate is created.
+    // MINMAX range: [2, 12]
+    sql """
+        INSERT INTO rf_scope_build VALUES
+            (2, 100), (4, 200), (6, 300), (8, 400), (10, 500), (12, 600);
+    """
+
+    sql "sync;"
+
+    // Set max_pushdown_conditions_per_column to 5, so the 6 IN values exceed 
it.
+    // This causes IN values to NOT be added to the ColumnValueRange (it stays 
as
+    // a scope range from the MINMAX filter), but the IN_LIST ColumnPredicate 
is still created.
+    sql "set max_pushdown_conditions_per_column = 5;"
+    // Use both IN and MIN_MAX runtime filter types so both are generated on 
the join key.
+    sql "set runtime_filter_type = 'IN_OR_BLOOM_FILTER,MIN_MAX';"
+    sql "set runtime_filter_wait_time_ms = 10000;"
+    sql "set runtime_filter_wait_infinitely = true;"
+    sql "set enable_runtime_filter_prune = false;"
+    sql "set enable_left_semi_direct_return_opt = true;"
+    sql "set parallel_pipeline_task_num = 1;"
+
+    // The join should only return 6 rows (matching k1 in {2,4,6,8,10,12}).
+    // If the IN_LIST predicate is incorrectly erased, the MINMAX scope [2,12]
+    // would let through rows with k1 in {3,5,7,9,11} as well, producing wrong 
results.
+    // We verify correctness by checking the result.
+    order_qt_join """
+        SELECT p.k1, p.v1
+        FROM rf_scope_probe p
+        LEFT SEMI JOIN rf_scope_build b ON p.k1 = b.k1
+        ORDER BY p.k1;
+    """
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to