(doris) 02/03: [opt](olap) Optimize the performance of StructFileColumnIterator::read_by_rowids in scenarios where the rowids are continuous (#58851)

mrhhsg Mon, 22 Dec 2025 22:32:43 -0800

This is an automated email from the ASF dual-hosted git repository.

mrhhsg pushed a commit to branch cherry-pick-nested_column_prune_4.0
in repository https://gitbox.apache.org/repos/asf/doris.git


commit 4e760a673eacb855f87d86314a32ea218a794310
Author: Jerry Hu <[email protected]>
AuthorDate: Fri Dec 12 10:09:32 2025 +0800

    [opt](olap) Optimize the performance of 
StructFileColumnIterator::read_by_rowids in scenarios where the rowids are 
continuous (#58851)
    
    ### What problem does this PR solve?
    
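    Avoid seeking and reading row by row: consecutive rowids are grouped into
    runs, and each run is read with a single seek followed by one batched read.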
    
    Issue Number: close #xxx
    
    Related PR: #xxx
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
            - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
        - [ ] Yes. <!-- Add document PR link here. eg:
        https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 53 ++++++++++++++++-----
 .../complex_types/test_pruned_columns.out          | 55 +++++++++++++++++++++-
 .../complex_types/test_pruned_columns.groovy       | 25 ++++++++--
 3 files changed, 115 insertions(+), 18 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 4c22f0c60a7..ca2f0e47705 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -1026,7 +1026,7 @@ Status MapFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr
         return Status::OK();
     }
 
-    auto& column_map = assert_cast<vectorized::ColumnMap&>(
+    auto& column_map = assert_cast<vectorized::ColumnMap&, 
TypeCheckOnRelease::DISABLE>(
             dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
                                : *dst);
     auto column_offsets_ptr = column_map.get_offsets_column().assume_mutable();
@@ -1070,7 +1070,8 @@ Status MapFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr
             RETURN_IF_ERROR(
                     _null_iterator->next_batch(&num_read, null_map_ptr, 
&null_signs_has_null));
         } else {
-            auto& null_map = 
assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
+            auto& null_map = assert_cast<vectorized::ColumnUInt8&, 
TypeCheckOnRelease::DISABLE>(
+                    *null_map_ptr);
             null_map.insert_many_vals(0, num_read);
         }
         DCHECK(num_read == *n);
@@ -1155,7 +1156,8 @@ Status MapFileColumnIterator::read_by_rowids(const 
rowid_t* rowids, const size_t
             ordinal_t ns = 0;
             RETURN_IF_ERROR(_offsets_iterator->_peek_one_offset(&ns));
             // overwrite with sentinel
-            
assert_cast<vectorized::ColumnOffset64&>(*next_starts_col).get_data()[i] = ns;
+            assert_cast<vectorized::ColumnOffset64&, 
TypeCheckOnRelease::DISABLE>(*next_starts_col)
+                    .get_data()[i] = ns;
         }
     }
 
@@ -1320,7 +1322,7 @@ Status StructFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumn
         return Status::OK();
     }
 
-    auto& column_struct = assert_cast<vectorized::ColumnStruct&>(
+    auto& column_struct = assert_cast<vectorized::ColumnStruct&, 
TypeCheckOnRelease::DISABLE>(
             dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
                                : *dst);
     for (size_t i = 0; i < column_struct.tuple_size(); i++) {
@@ -1346,7 +1348,8 @@ Status StructFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumn
             RETURN_IF_ERROR(
                     _null_iterator->next_batch(&num_read, null_map_ptr, 
&null_signs_has_null));
         } else {
-            auto& null_map = 
assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
+            auto& null_map = assert_cast<vectorized::ColumnUInt8&, 
TypeCheckOnRelease::DISABLE>(
+                    *null_map_ptr);
             null_map.insert_many_vals(0, num_read);
         }
         DCHECK(num_read == *n);
@@ -1378,12 +1381,33 @@ Status StructFileColumnIterator::read_by_rowids(const 
rowid_t* rowids, const siz
         return Status::OK();
     }
 
-    for (size_t i = 0; i < count; ++i) {
-        RETURN_IF_ERROR(seek_to_ordinal(rowids[i]));
-        size_t num_read = 1;
+    if (count == 0) {
+        return Status::OK();
+    }
+
+    size_t this_run = 1;
+    auto start_idx = rowids[0];
+    auto last_idx = rowids[0];
+    for (size_t i = 1; i < count; ++i) {
+        if (last_idx == rowids[i] - 1) {
+            last_idx = rowids[i];
+            this_run++;
+            continue;
+        }
+        RETURN_IF_ERROR(seek_to_ordinal(start_idx));
+        size_t num_read = this_run;
         RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr));
-        DCHECK(num_read == 1);
+        DCHECK_EQ(num_read, this_run);
+
+        start_idx = rowids[i];
+        last_idx = rowids[i];
+        this_run = 1;
     }
+
+    RETURN_IF_ERROR(seek_to_ordinal(start_idx));
+    size_t num_read = this_run;
+    RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr));
+    DCHECK_EQ(num_read, this_run);
     return Status::OK();
 }
 
@@ -1485,8 +1509,9 @@ Status 
OffsetFileColumnIterator::_peek_one_offset(ordinal_t* offset) {
         _peek_tmp_col->clear();
         RETURN_IF_ERROR(offset_page_decoder->peek_next_batch(&n, 
_peek_tmp_col)); // not null
         DCHECK(_peek_tmp_col->size() == 1);
-        *offset =
-                assert_cast<const 
vectorized::ColumnOffset64*>(_peek_tmp_col.get())->get_element(0);
+        *offset = assert_cast<const vectorized::ColumnOffset64*, 
TypeCheckOnRelease::DISABLE>(
+                          _peek_tmp_col.get())
+                          ->get_element(0);
     } else {
         *offset = 
_offset_iterator->get_current_page()->next_array_item_ordinal;
     }
@@ -1617,7 +1642,8 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnP
             RETURN_IF_ERROR(
                     _null_iterator->next_batch(&num_read, null_map_ptr, 
&null_signs_has_null));
         } else {
-            auto& null_map = 
assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr);
+            auto& null_map = assert_cast<vectorized::ColumnUInt8&, 
TypeCheckOnRelease::DISABLE>(
+                    *null_map_ptr);
             null_map.insert_many_vals(0, num_read);
         }
         DCHECK(num_read == *n);
@@ -2203,7 +2229,8 @@ void 
DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP
 
 Status RowIdColumnIteratorV2::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst,
                                          bool* has_null) {
-    auto* string_column = assert_cast<vectorized::ColumnString*>(dst.get());
+    auto* string_column =
+            assert_cast<vectorized::ColumnString*, 
TypeCheckOnRelease::DISABLE>(dst.get());
 
     for (uint32_t i = 0; i < *n; ++i) {
         uint32_t row_id = _current_rowid + i;
diff --git 
a/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out 
b/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
index 86728bafd1c..b3312aa670c 100644
--- a/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
+++ b/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out
@@ -1,7 +1,18 @@
 -- This file is automatically generated. You should know what you did if you 
want to edit this
 -- !sql --
-1      {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]}
-2      {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]}
+1      {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}], 
"value":1}
+2      {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}], 
"value":2}
+3      {"city":"guangzhou", "data":[{1:{"a":90, "b":60}, 2:{"a":110, 
"b":40}}], "value":3}
+4      {"city":"shenzhen", "data":[{2:{"a":130, "b":20}, 1:{"a":150, 
"b":40}}], "value":4}
+5      {"city":"hangzhou", "data":[{1:{"a":170, "b":80}, 2:{"a":190, 
"b":40}}], "value":5}
+6      {"city":"nanjing", "data":[{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}], 
"value":6}
+7      {"city":"tianjin", "data":[{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}], 
"value":7}
+8      {"city":"chongqing", "data":[{2:{"a":290, "b":80}, 1:{"a":310, 
"b":40}}], "value":8}
+9      {"city":"wuhan", "data":[{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}], 
"value":9}
+10     {"city":"xian", "data":[{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}], 
"value":10}
+11     {"city":"changsha", "data":[{1:{"a":410, "b":80}, 2:{"a":430, 
"b":40}}], "value":11}
+12     {"city":"qingdao", "data":[{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}], 
"value":12}
+13     {"city":"dalian", "data":[{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}], 
"value":13}
 
 -- !sql1 --
 1      [10]
@@ -9,18 +20,58 @@
 -- !sql2 --
 1      beijing
 2      shanghai
+3      guangzhou
+4      shenzhen
+5      hangzhou
+6      nanjing
+7      tianjin
+8      chongqing
+9      wuhan
+10     xian
+11     changsha
+12     qingdao
+13     dalian
 
 -- !sql3 --
 1      [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]
 2      [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]
+3      [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}]
+4      [{2:{"a":130, "b":20}, 1:{"a":150, "b":40}}]
+5      [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}]
+6      [{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}]
+7      [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}]
+8      [{2:{"a":290, "b":80}, 1:{"a":310, "b":40}}]
+9      [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}]
+10     [{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}]
+11     [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}]
+12     [{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}]
+13     [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}]
 
 -- !sql4 --
 1      [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]
 2      [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]
+3      [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}]
+5      [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}]
+7      [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}]
+9      [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}]
+11     [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}]
+13     [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}]
 
 -- !sql5 --
 1      beijing
 2      shanghai
+3      guangzhou
+5      hangzhou
+7      tianjin
+9      wuhan
+11     changsha
+13     dalian
+
+-- !sql5_1 --
+61
+
+-- !sql5_2 --
+61
 
 -- !sql6 --
 2
diff --git 
a/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy 
b/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
index c2a7e2b7146..a99b7b6da59 100644
--- 
a/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
+++ 
b/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy
@@ -20,7 +20,7 @@ suite("test_pruned_columns") {
     sql """
         CREATE TABLE `tbl_test_pruned_columns` (
             `id` int NULL,
-            `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>> 
NULL
+            `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>, 
value:int> NULL
         ) ENGINE=OLAP
         DUPLICATE KEY(`id`)
         DISTRIBUTED BY RANDOM BUCKETS AUTO
@@ -31,8 +31,19 @@ suite("test_pruned_columns") {
 
     sql """
         insert into `tbl_test_pruned_columns` values
-            (1, named_struct('city', 'beijing', 'data', array(map(1, 
named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))))),
-            (2, named_struct('city', 'shanghai', 'data', array(map(2, 
named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80)))));
+            (1, named_struct('city', 'beijing', 'data', array(map(1, 
named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))), 'value', 
1)),
+            (2, named_struct('city', 'shanghai', 'data', array(map(2, 
named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80))), 'value', 
2)),
+            (3, named_struct('city', 'guangzhou', 'data', array(map(1, 
named_struct('a', 90, 'b', 60.0), 2, named_struct('a', 110, 'b', 40))), 
'value', 3)),
+            (4, named_struct('city', 'shenzhen', 'data', array(map(2, 
named_struct('a', 130, 'b', 20.0), 1, named_struct('a', 150, 'b', 40))), 
'value', 4)),
+            (5, named_struct('city', 'hangzhou', 'data', array(map(1, 
named_struct('a', 170, 'b', 80.0), 2, named_struct('a', 190, 'b', 40))), 
'value', 5)),
+            (6, named_struct('city', 'nanjing', 'data', array(map(2, 
named_struct('a', 210, 'b', 60.0), 1, named_struct('a', 230, 'b', 40))), 
'value', 6)),
+            (7, named_struct('city', 'tianjin', 'data', array(map(1, 
named_struct('a', 250, 'b', 20.0), 2, named_struct('a', 270, 'b', 40))), 
'value', 7)),
+            (8, named_struct('city', 'chongqing', 'data', array(map(2, 
named_struct('a', 290, 'b', 80.0), 1, named_struct('a', 310, 'b', 40))), 
'value', 8)),
+            (9, named_struct('city', 'wuhan', 'data', array(map(1, 
named_struct('a', 330, 'b', 60.0), 2, named_struct('a', 350, 'b', 40))), 
'value', 9)),
+            (10, named_struct('city', 'xian', 'data', array(map(2, 
named_struct('a', 370, 'b', 20.0), 1, named_struct('a', 390, 'b', 40))), 
'value', 10)),
+            (11, named_struct('city', 'changsha', 'data', array(map(1, 
named_struct('a', 410, 'b', 80.0), 2, named_struct('a', 430, 'b', 40))), 
'value', 11)),
+            (12, named_struct('city', 'qingdao', 'data', array(map(2, 
named_struct('a', 450, 'b', 60.0), 1, named_struct('a', 470, 'b', 40))), 
'value', 12)),
+            (13, named_struct('city', 'dalian', 'data', array(map(1, 
named_struct('a', 490, 'b', 20.0), 2, named_struct('a', 510, 'b', 40))), 
'value', 13));
     """
 
     qt_sql """
@@ -59,6 +70,14 @@ suite("test_pruned_columns") {
         select id, struct_element(s, 'city') from `tbl_test_pruned_columns` 
where struct_element(struct_element(s, 'data')[1][2], 'b') = 40 order by 1;
     """
 
+    qt_sql5_1 """
+        select /*+ set enable_prune_nested_column = 1; */ sum(s.value) from 
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
+    """
+
+    qt_sql5_2 """
+        select /*+ set enable_prune_nested_column = 0; */ sum(s.value) from 
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13);
+    """
+
     sql """DROP TABLE IF EXISTS `tbl_test_pruned_columns_map`"""
     sql """
         CREATE TABLE `tbl_test_pruned_columns_map` (


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(doris) 02/03: [opt](olap) Optimize the performance of StructFileColumnIterator::read_by_rowids in scenarios where the rowids are continuous (#58851)

Reply via email to