This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.1-lts in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.1-lts by this push: new 48f70cf54c [improvement](scanner) using avg rowset to calculate batch size instead of using total_bytes since it costs a lot of cpu (#14273) 48f70cf54c is described below commit 48f70cf54c1e43e593249d23413ccaf874ae3cbe Author: yiguolei <676222...@qq.com> AuthorDate: Thu Nov 17 09:23:54 2022 +0800 [improvement](scanner) using avg rowset to calculate batch size instead of using total_bytes since it costs a lot of cpu (#14273) --- be/src/exec/olap_scanner.cpp | 19 +++++++++++++++---- be/src/exec/olap_scanner.h | 5 ++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index 413ebee301..5fe1d551d3 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -210,14 +210,14 @@ Status OlapScanner::_init_tablet_reader_params( bool has_replace_col = false; for (auto col : _return_columns) { if (_tablet->tablet_schema().column(col).aggregation() == - FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE) { + FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE) { has_replace_col = true; break; } } if (auto sequence_col_idx = _tablet->tablet_schema().sequence_col_idx(); - has_replace_col && std::find(_return_columns.begin(), _return_columns.end(), - sequence_col_idx) == _return_columns.end()) { + has_replace_col && std::find(_return_columns.begin(), _return_columns.end(), + sequence_col_idx) == _return_columns.end()) { _tablet_reader_params.return_columns.push_back(sequence_col_idx); } } @@ -292,7 +292,18 @@ Status OlapScanner::get_batch(RuntimeState* state, RowBatch* batch, bool* eof) { // Use total_byte_size here, not tuple_pool's allocated bytes, because we preallocated tuple pool at beginning // its size maybe larger than threshold, so that scanner will break here and may dead loop. // Not need check num_rows > 0, because total_byte_size() == 0 if num_rows == 0. 
- if (batch->is_full() || batch->total_byte_size() >= raw_bytes_threshold || + if (_avg_row_size == 0 && batch->num_rows() > 0) { + // total_byte_size() costs a lot of CPU time, so compute the avg row size here. + _first_batch_row_num += batch->num_rows(); + _first_batch_size += batch->total_byte_size(); + // Accumulate several batches before calculating the avg row size, so that the estimate is not based on only a small number of rows + if (_first_batch_size > raw_bytes_threshold) { + _avg_row_size = _first_batch_size / _first_batch_row_num; + } + } + int64_t batch_total_bytes = _avg_row_size > 0 ? _avg_row_size * batch->num_rows() + : batch->total_byte_size(); + if (batch->is_full() || batch_total_bytes >= raw_bytes_threshold || raw_rows_read() >= raw_rows_threshold) { _update_realtime_counter(); break; diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h index ce758365da..fbfccc8bcd 100644 --- a/be/src/exec/olap_scanner.h +++ b/be/src/exec/olap_scanner.h @@ -33,9 +33,9 @@ #include "gen_cpp/PlanNodes_types.h" #include "olap/delete_handler.h" #include "olap/olap_cond.h" -#include "olap/tuple_reader.h" #include "olap/rowset/column_data.h" #include "olap/storage_engine.h" +#include "olap/tuple_reader.h" #include "runtime/descriptors.h" #include "runtime/tuple.h" #include "runtime/vectorized_row_batch.h" @@ -142,6 +142,9 @@ protected: int64_t _num_rows_read = 0; int64_t _raw_rows_read = 0; int64_t _compressed_bytes_read = 0; + int64_t _avg_row_size = 0; + int64_t _first_batch_row_num = 0; + int64_t _first_batch_size = 0; size_t _batch_size = 0; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org