This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.1-lts
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-1.1-lts by this push:
     new 48f70cf54c [improvement](scanner) using avg rowset to calculate batch 
size instead of using total_bytes since it costs a lot of cpu (#14273)
48f70cf54c is described below

commit 48f70cf54c1e43e593249d23413ccaf874ae3cbe
Author: yiguolei <676222...@qq.com>
AuthorDate: Thu Nov 17 09:23:54 2022 +0800

    [improvement](scanner) using avg rowset to calculate batch size instead of 
using total_bytes since it costs a lot of cpu (#14273)
---
 be/src/exec/olap_scanner.cpp | 19 +++++++++++++++----
 be/src/exec/olap_scanner.h   |  5 ++++-
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp
index 413ebee301..5fe1d551d3 100644
--- a/be/src/exec/olap_scanner.cpp
+++ b/be/src/exec/olap_scanner.cpp
@@ -210,14 +210,14 @@ Status OlapScanner::_init_tablet_reader_params(
             bool has_replace_col = false;
             for (auto col : _return_columns) {
                 if (_tablet->tablet_schema().column(col).aggregation() ==
-                        
FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE) {
+                    FieldAggregationMethod::OLAP_FIELD_AGGREGATION_REPLACE) {
                     has_replace_col = true;
                     break;
                 }
             }
             if (auto sequence_col_idx = 
_tablet->tablet_schema().sequence_col_idx();
-                    has_replace_col && std::find(_return_columns.begin(), 
_return_columns.end(),
-                        sequence_col_idx) == _return_columns.end()) {
+                has_replace_col && std::find(_return_columns.begin(), 
_return_columns.end(),
+                                             sequence_col_idx) == 
_return_columns.end()) {
                 
_tablet_reader_params.return_columns.push_back(sequence_col_idx);
             }
         }
@@ -292,7 +292,18 @@ Status OlapScanner::get_batch(RuntimeState* state, 
RowBatch* batch, bool* eof) {
             // Use total_byte_size here, not tuple_pool's allocated bytes, 
because we preallocated tuple pool at beginning
             // its size maybe larger than threshold, so that scanner will 
break here and may dead loop.
             // Not need check num_rows > 0, because total_byte_size() == 0  if 
num_rows == 0.
-            if (batch->is_full() || batch->total_byte_size() >= 
raw_bytes_threshold ||
+            if (_avg_row_size == 0 && batch->num_rows() > 0) {
+                // total_byte_size() costs a lot of CPU time, so compute the 
avg row size here and use it to estimate the batch size.
+                _first_batch_row_num += batch->num_rows();
+                _first_batch_size += batch->total_byte_size();
+                // Keep accumulating until the sampled bytes exceed the 
threshold, so the avg row size is not computed from only a small number of rows.
+                if (_first_batch_size > raw_bytes_threshold) {
+                    _avg_row_size = _first_batch_size / _first_batch_row_num;
+                }
+            }
+            int64_t batch_total_bytes = _avg_row_size > 0 ? _avg_row_size * 
batch->num_rows()
+                                                          : 
batch->total_byte_size();
+            if (batch->is_full() || batch_total_bytes >= raw_bytes_threshold ||
                 raw_rows_read() >= raw_rows_threshold) {
                 _update_realtime_counter();
                 break;
diff --git a/be/src/exec/olap_scanner.h b/be/src/exec/olap_scanner.h
index ce758365da..fbfccc8bcd 100644
--- a/be/src/exec/olap_scanner.h
+++ b/be/src/exec/olap_scanner.h
@@ -33,9 +33,9 @@
 #include "gen_cpp/PlanNodes_types.h"
 #include "olap/delete_handler.h"
 #include "olap/olap_cond.h"
-#include "olap/tuple_reader.h"
 #include "olap/rowset/column_data.h"
 #include "olap/storage_engine.h"
+#include "olap/tuple_reader.h"
 #include "runtime/descriptors.h"
 #include "runtime/tuple.h"
 #include "runtime/vectorized_row_batch.h"
@@ -142,6 +142,9 @@ protected:
     int64_t _num_rows_read = 0;
     int64_t _raw_rows_read = 0;
     int64_t _compressed_bytes_read = 0;
+    int64_t _avg_row_size = 0;
+    int64_t _first_batch_row_num = 0;
+    int64_t _first_batch_size = 0;
 
     size_t _batch_size = 0;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to