[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #7613: [Vectorization] support SegmentIterator vectorization

GitBox Tue, 11 Jan 2022 02:31:29 -0800


zuochunwei commented on a change in pull request #7613:
URL: https://github.com/apache/incubator-doris/pull/7613#discussion_r781978307




##########
File path: be/src/olap/rowset/segment_v2/segment_iterator.cpp
##########
@@ -581,9 +586,358 @@ Status SegmentIterator::next_batch(RowBlockV2* block) {
     return Status::OK();
 }
 
+/* ---------------------- for vecterization implementation  
---------------------- */
+
+// todo(wb) need a UT here
+void SegmentIterator::_vec_init_lazy_materialization() {
+    _is_pred_column.resize(_schema.columns().size(), false);
+
+    std::set<ColumnId> pred_column_ids; // including short_cir_pred_col_id_set 
and vec_pred_col_id_set
+    _is_all_column_basic_type = true;
+    bool is_predicate_column_exists = false;
+    bool is_non_predicate_column_exists = false;
+    
+    if (!_col_predicates.empty()) {
+        is_predicate_column_exists = true;
+
+        std::set<ColumnId> short_cir_pred_col_id_set; // using set for 
distinct cid
+        std::set<ColumnId> vec_pred_col_id_set;
+        
+        for (auto predicate : _col_predicates) {
+            auto cid = predicate->column_id();
+            FieldType type = _schema.column(cid)->type();
+            _is_pred_column[cid] = true;
+            pred_column_ids.insert(cid);
+
+            // for date type which can not be executed in a vectorized way, 
using short circuit execution
+            if (type == OLAP_FIELD_TYPE_VARCHAR || type == 
OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_DECIMAL
+                || type == OLAP_FIELD_TYPE_DATE || 
predicate->is_in_predicate()) {
+                short_cir_pred_col_id_set.insert(cid);
+                _short_cir_eval_predicate.push_back(predicate);
+                _is_all_column_basic_type = false;
+            } else {
+                vec_pred_col_id_set.insert(predicate->column_id());
+                if (_pre_eval_block_predicate == nullptr) {
+                    _pre_eval_block_predicate = new AndBlockColumnPredicate();
+                }
+                
reinterpret_cast<MutilColumnBlockPredicate*>(_pre_eval_block_predicate)->add_column_predicate(new
 SingleColumnBlockPredicate(predicate));
+            }
+        }
+
+        std::set<ColumnId> del_cond_id_set;
+        
_opts.delete_condition_predicates.get()->get_all_column_ids(del_cond_id_set);
+        short_cir_pred_col_id_set.insert(del_cond_id_set.begin(), 
del_cond_id_set.end());
+        pred_column_ids.insert(del_cond_id_set.begin(), del_cond_id_set.end());
+
+        if (_schema.column_ids().size() > pred_column_ids.size()) {
+            for (auto cid : _schema.column_ids()) {
+                if (!_is_pred_column[cid]) {
+                    _non_predicate_columns.push_back(cid);
+                    is_non_predicate_column_exists = true;
+                }
+            }
+        }
+
+        _vec_pred_column_ids.assign(vec_pred_col_id_set.cbegin(), 
vec_pred_col_id_set.cend());
+        _short_cir_pred_column_ids.assign(short_cir_pred_col_id_set.cbegin(), 
short_cir_pred_col_id_set.cend());
+    } else {
+        _is_all_column_basic_type = false;
+        is_non_predicate_column_exists = true;
+        for (auto cid : _schema.column_ids()) {
+            _non_predicate_columns.push_back(cid);
+        }
+    }
+
+    // note(wb) in following cases we disable lazy materialization
+    // case 1: when all column is basic type(is_all_column_basic_type = true) 
+    //   because we think `seek and read` cost > read page cost, lazy 
materialize may cause more `seek and read`, so disable it
+    // case 2: all column is predicate column
+    // case 3: all column is not predicate column
+    // todo(wb) need further research more lazy materialization rule, such as 
get more info from `statistics` for better decision
+    if (_is_all_column_basic_type) {
+        std::set<ColumnId> pred_set(_vec_pred_column_ids.begin(), 
_vec_pred_column_ids.end());
+        std::set<ColumnId> non_pred_set(_non_predicate_columns.begin(), 
_non_predicate_columns.end());
+
+        // when _is_all_column_basic_type = true, _first_read_column_ids 
should keep the same order with _schema.column_ids which stands for return 
column order
+        for (int i = 0; i < _schema.num_column_ids(); i++) {
+            auto cid = _schema.column_ids()[i];
+            if (pred_set.find(cid) != pred_set.end()) {
+                _first_read_column_ids.push_back(cid);    
+            } else if (non_pred_set.find(cid) != non_pred_set.end()) {
+                _first_read_column_ids.push_back(cid);
+                _is_pred_column[cid] = true; // in this case, non-predicate 
column should also be filtered by sel idx, so we regard it as pred columns
+            }
+        }
+
+    } else if (is_predicate_column_exists && !is_non_predicate_column_exists) {
+        _first_read_column_ids.assign(pred_column_ids.cbegin(), 
pred_column_ids.cend());
+    } else if (!is_predicate_column_exists && is_non_predicate_column_exists) {
+        for (auto cid : _non_predicate_columns) {
+            _first_read_column_ids.push_back(cid);
+        }
+    } else {
+        _lazy_materialization_read = true;
+        _first_read_column_ids.assign(pred_column_ids.cbegin(), 
pred_column_ids.cend());
+    }
+
+    // make _schema_block_id_map
+    _schema_block_id_map.resize(_schema.columns().size());
+    for (int i = 0; i < _schema.num_column_ids(); i++) {
+        auto cid = _schema.column_ids()[i];
+        _schema_block_id_map[cid] = i;
+    }
+
+}
+
+Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids, 
vectorized::MutableColumns& column_block, size_t nrows) {
+    for (auto cid : column_ids) {
+        auto& column = column_block[cid];
+        size_t rows_read = nrows;
+        RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, 
column));
+        DCHECK_EQ(nrows, rows_read);
+    }
+    return Status::OK();
+}
+
+void SegmentIterator::_init_current_block(vectorized::Block* block, 
std::vector<vectorized::MutableColumnPtr>& current_columns) {

Review comment:
       why pass member data _current_return_columns as member function‘s 
argument?
   why not use _current_return_columns in _init_current_block directly?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[GitHub] [incubator-doris] zuochunwei commented on a change in pull request #7613: [Vectorization] support SegmentIterator vectorization

Reply via email to