This is an automated email from the ASF dual-hosted git repository. mrhhsg pushed a commit to branch cherry-pick-nested_column_prune_4.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 4e760a673eacb855f87d86314a32ea218a794310
Author: Jerry Hu <[email protected]>
AuthorDate: Fri Dec 12 10:09:32 2025 +0800

[opt](olap) Optimize the performance of StructFileColumnIterator::read_by_rowids in scenarios where the rowids are continuous (#58851)

### What problem does this PR solve?

Avoid seeking and reading row by row.

Issue Number: close #xxx

Related PR: #xxx

Problem Summary:

### Release note

None

### Check List (For Author)

- Test <!-- At least one of them must be included. -->
    - [ ] Regression test
    - [ ] Unit Test
    - [ ] Manual test (add detailed scripts or steps below)
    - [ ] No need to test or manual test. Explain why:
        - [ ] This is a refactor/code format and no logic has been changed.
        - [ ] Previous test can cover this change.
        - [ ] No code files have been changed.
        - [ ] Other reason <!-- Add your reason? -->

- Behavior changed:
    - [ ] No.
    - [ ] Yes. <!-- Explain the behavior change -->

- Does this need documentation?
    - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. 
eg: https://github.com/apache/doris-website/pull/1214 --> ### Check List (For Reviewer who merge this PR) - [ ] Confirm the release note - [ ] Confirm test cases - [ ] Confirm document - [ ] Add branch pick label <!-- Add branch pick label that this PR should merge into --> --- be/src/olap/rowset/segment_v2/column_reader.cpp | 53 ++++++++++++++++----- .../complex_types/test_pruned_columns.out | 55 +++++++++++++++++++++- .../complex_types/test_pruned_columns.groovy | 25 ++++++++-- 3 files changed, 115 insertions(+), 18 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 4c22f0c60a7..ca2f0e47705 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -1026,7 +1026,7 @@ Status MapFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr return Status::OK(); } - auto& column_map = assert_cast<vectorized::ColumnMap&>( + auto& column_map = assert_cast<vectorized::ColumnMap&, TypeCheckOnRelease::DISABLE>( dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column() : *dst); auto column_offsets_ptr = column_map.get_offsets_column().assume_mutable(); @@ -1070,7 +1070,8 @@ Status MapFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr RETURN_IF_ERROR( _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); } else { - auto& null_map = assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr); + auto& null_map = assert_cast<vectorized::ColumnUInt8&, TypeCheckOnRelease::DISABLE>( + *null_map_ptr); null_map.insert_many_vals(0, num_read); } DCHECK(num_read == *n); @@ -1155,7 +1156,8 @@ Status MapFileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t ordinal_t ns = 0; RETURN_IF_ERROR(_offsets_iterator->_peek_one_offset(&ns)); // overwrite with sentinel - assert_cast<vectorized::ColumnOffset64&>(*next_starts_col).get_data()[i] = ns; + assert_cast<vectorized::ColumnOffset64&, TypeCheckOnRelease::DISABLE>(*next_starts_col) + .get_data()[i] = ns; } } @@ -1320,7 +1322,7 @@ Status StructFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumn return Status::OK(); } - auto& column_struct = assert_cast<vectorized::ColumnStruct&>( + auto& column_struct = assert_cast<vectorized::ColumnStruct&, TypeCheckOnRelease::DISABLE>( dst->is_nullable() ? 
static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column() : *dst); for (size_t i = 0; i < column_struct.tuple_size(); i++) { @@ -1346,7 +1348,8 @@ Status StructFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumn RETURN_IF_ERROR( _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); } else { - auto& null_map = assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr); + auto& null_map = assert_cast<vectorized::ColumnUInt8&, TypeCheckOnRelease::DISABLE>( + *null_map_ptr); null_map.insert_many_vals(0, num_read); } DCHECK(num_read == *n); @@ -1378,12 +1381,33 @@ Status StructFileColumnIterator::read_by_rowids(const rowid_t* rowids, const siz return Status::OK(); } - for (size_t i = 0; i < count; ++i) { - RETURN_IF_ERROR(seek_to_ordinal(rowids[i])); - size_t num_read = 1; + if (count == 0) { + return Status::OK(); + } + + size_t this_run = 1; + auto start_idx = rowids[0]; + auto last_idx = rowids[0]; + for (size_t i = 1; i < count; ++i) { + if (last_idx == rowids[i] - 1) { + last_idx = rowids[i]; + this_run++; + continue; + } + RETURN_IF_ERROR(seek_to_ordinal(start_idx)); + size_t num_read = this_run; RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr)); - DCHECK(num_read == 1); + DCHECK_EQ(num_read, this_run); + + start_idx = rowids[i]; + last_idx = rowids[i]; + this_run = 1; } + + RETURN_IF_ERROR(seek_to_ordinal(start_idx)); + size_t num_read = this_run; + RETURN_IF_ERROR(next_batch(&num_read, dst, nullptr)); + DCHECK_EQ(num_read, this_run); return Status::OK(); } @@ -1485,8 +1509,9 @@ Status OffsetFileColumnIterator::_peek_one_offset(ordinal_t* offset) { _peek_tmp_col->clear(); RETURN_IF_ERROR(offset_page_decoder->peek_next_batch(&n, _peek_tmp_col)); // not null DCHECK(_peek_tmp_col->size() == 1); - *offset = - assert_cast<const vectorized::ColumnOffset64*>(_peek_tmp_col.get())->get_element(0); + *offset = assert_cast<const vectorized::ColumnOffset64*, TypeCheckOnRelease::DISABLE>( + _peek_tmp_col.get()) + ->get_element(0); } 
else { *offset = _offset_iterator->get_current_page()->next_array_item_ordinal; } @@ -1617,7 +1642,8 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnP RETURN_IF_ERROR( _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); } else { - auto& null_map = assert_cast<vectorized::ColumnUInt8&>(*null_map_ptr); + auto& null_map = assert_cast<vectorized::ColumnUInt8&, TypeCheckOnRelease::DISABLE>( + *null_map_ptr); null_map.insert_many_vals(0, num_read); } DCHECK(num_read == *n); @@ -2203,7 +2229,8 @@ void DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP Status RowIdColumnIteratorV2::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* has_null) { - auto* string_column = assert_cast<vectorized::ColumnString*>(dst.get()); + auto* string_column = + assert_cast<vectorized::ColumnString*, TypeCheckOnRelease::DISABLE>(dst.get()); for (uint32_t i = 0; i < *n; ++i) { uint32_t row_id = _current_rowid + i; diff --git a/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out b/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out index 86728bafd1c..b3312aa670c 100644 --- a/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out +++ b/regression-test/data/datatype_p0/complex_types/test_pruned_columns.out @@ -1,7 +1,18 @@ -- This file is automatically generated. 
You should know what you did if you want to edit this -- !sql -- -1 {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}]} -2 {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}]} +1 {"city":"beijing", "data":[{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}], "value":1} +2 {"city":"shanghai", "data":[{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}], "value":2} +3 {"city":"guangzhou", "data":[{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}], "value":3} +4 {"city":"shenzhen", "data":[{2:{"a":130, "b":20}, 1:{"a":150, "b":40}}], "value":4} +5 {"city":"hangzhou", "data":[{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}], "value":5} +6 {"city":"nanjing", "data":[{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}], "value":6} +7 {"city":"tianjin", "data":[{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}], "value":7} +8 {"city":"chongqing", "data":[{2:{"a":290, "b":80}, 1:{"a":310, "b":40}}], "value":8} +9 {"city":"wuhan", "data":[{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}], "value":9} +10 {"city":"xian", "data":[{2:{"a":370, "b":20}, 1:{"a":390, "b":40}}], "value":10} +11 {"city":"changsha", "data":[{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}], "value":11} +12 {"city":"qingdao", "data":[{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}], "value":12} +13 {"city":"dalian", "data":[{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}], "value":13} -- !sql1 -- 1 [10] @@ -9,18 +20,58 @@ -- !sql2 -- 1 beijing 2 shanghai +3 guangzhou +4 shenzhen +5 hangzhou +6 nanjing +7 tianjin +8 chongqing +9 wuhan +10 xian +11 changsha +12 qingdao +13 dalian -- !sql3 -- 1 [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}] 2 [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}] +3 [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}] +4 [{2:{"a":130, "b":20}, 1:{"a":150, "b":40}}] +5 [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}] +6 [{2:{"a":210, "b":60}, 1:{"a":230, "b":40}}] +7 [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}] +8 [{2:{"a":290, "b":80}, 1:{"a":310, "b":40}}] +9 [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}] +10 [{2:{"a":370, "b":20}, 
1:{"a":390, "b":40}}] +11 [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}] +12 [{2:{"a":450, "b":60}, 1:{"a":470, "b":40}}] +13 [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}] -- !sql4 -- 1 [{1:{"a":10, "b":20}, 2:{"a":30, "b":40}}] 2 [{2:{"a":50, "b":40}, 1:{"a":70, "b":80}}] +3 [{1:{"a":90, "b":60}, 2:{"a":110, "b":40}}] +5 [{1:{"a":170, "b":80}, 2:{"a":190, "b":40}}] +7 [{1:{"a":250, "b":20}, 2:{"a":270, "b":40}}] +9 [{1:{"a":330, "b":60}, 2:{"a":350, "b":40}}] +11 [{1:{"a":410, "b":80}, 2:{"a":430, "b":40}}] +13 [{1:{"a":490, "b":20}, 2:{"a":510, "b":40}}] -- !sql5 -- 1 beijing 2 shanghai +3 guangzhou +5 hangzhou +7 tianjin +9 wuhan +11 changsha +13 dalian + +-- !sql5_1 -- +61 + +-- !sql5_2 -- +61 -- !sql6 -- 2 diff --git a/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy b/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy index c2a7e2b7146..a99b7b6da59 100644 --- a/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy +++ b/regression-test/suites/datatype_p0/complex_types/test_pruned_columns.groovy @@ -20,7 +20,7 @@ suite("test_pruned_columns") { sql """ CREATE TABLE `tbl_test_pruned_columns` ( `id` int NULL, - `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>> NULL + `s` struct<city:text,data:array<map<int,struct<a:int,b:double>>>, value:int> NULL ) ENGINE=OLAP DUPLICATE KEY(`id`) DISTRIBUTED BY RANDOM BUCKETS AUTO @@ -31,8 +31,19 @@ suite("test_pruned_columns") { sql """ insert into `tbl_test_pruned_columns` values - (1, named_struct('city', 'beijing', 'data', array(map(1, named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))))), - (2, named_struct('city', 'shanghai', 'data', array(map(2, named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80))))); + (1, named_struct('city', 'beijing', 'data', array(map(1, named_struct('a', 10, 'b', 20.0), 2, named_struct('a', 30, 'b', 40))), 'value', 1)), + (2, named_struct('city', 'shanghai', 'data', array(map(2, 
named_struct('a', 50, 'b', 40.0), 1, named_struct('a', 70, 'b', 80))), 'value', 2)), + (3, named_struct('city', 'guangzhou', 'data', array(map(1, named_struct('a', 90, 'b', 60.0), 2, named_struct('a', 110, 'b', 40))), 'value', 3)), + (4, named_struct('city', 'shenzhen', 'data', array(map(2, named_struct('a', 130, 'b', 20.0), 1, named_struct('a', 150, 'b', 40))), 'value', 4)), + (5, named_struct('city', 'hangzhou', 'data', array(map(1, named_struct('a', 170, 'b', 80.0), 2, named_struct('a', 190, 'b', 40))), 'value', 5)), + (6, named_struct('city', 'nanjing', 'data', array(map(2, named_struct('a', 210, 'b', 60.0), 1, named_struct('a', 230, 'b', 40))), 'value', 6)), + (7, named_struct('city', 'tianjin', 'data', array(map(1, named_struct('a', 250, 'b', 20.0), 2, named_struct('a', 270, 'b', 40))), 'value', 7)), + (8, named_struct('city', 'chongqing', 'data', array(map(2, named_struct('a', 290, 'b', 80.0), 1, named_struct('a', 310, 'b', 40))), 'value', 8)), + (9, named_struct('city', 'wuhan', 'data', array(map(1, named_struct('a', 330, 'b', 60.0), 2, named_struct('a', 350, 'b', 40))), 'value', 9)), + (10, named_struct('city', 'xian', 'data', array(map(2, named_struct('a', 370, 'b', 20.0), 1, named_struct('a', 390, 'b', 40))), 'value', 10)), + (11, named_struct('city', 'changsha', 'data', array(map(1, named_struct('a', 410, 'b', 80.0), 2, named_struct('a', 430, 'b', 40))), 'value', 11)), + (12, named_struct('city', 'qingdao', 'data', array(map(2, named_struct('a', 450, 'b', 60.0), 1, named_struct('a', 470, 'b', 40))), 'value', 12)), + (13, named_struct('city', 'dalian', 'data', array(map(1, named_struct('a', 490, 'b', 20.0), 2, named_struct('a', 510, 'b', 40))), 'value', 13)); """ qt_sql """ @@ -59,6 +70,14 @@ suite("test_pruned_columns") { select id, struct_element(s, 'city') from `tbl_test_pruned_columns` where struct_element(struct_element(s, 'data')[1][2], 'b') = 40 order by 1; """ + qt_sql5_1 """ + select /*+ set enable_prune_nested_column = 1; */ sum(s.value) from 
`tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13); + """ + + qt_sql5_2 """ + select /*+ set enable_prune_nested_column = 0; */ sum(s.value) from `tbl_test_pruned_columns` where id in(1,2,3,4,8,9,10,11,13); + """ + sql """DROP TABLE IF EXISTS `tbl_test_pruned_columns_map`""" sql """ CREATE TABLE `tbl_test_pruned_columns_map` ( --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
