zuochunwei commented on a change in pull request #7613: URL: https://github.com/apache/incubator-doris/pull/7613#discussion_r781978307
########## File path: be/src/olap/rowset/segment_v2/segment_iterator.cpp ########## @@ -581,9 +586,358 @@ Status SegmentIterator::next_batch(RowBlockV2* block) { return Status::OK(); } +/* ---------------------- for vecterization implementation ---------------------- */ + +// todo(wb) need a UT here +void SegmentIterator::_vec_init_lazy_materialization() { + _is_pred_column.resize(_schema.columns().size(), false); + + std::set<ColumnId> pred_column_ids; // including short_cir_pred_col_id_set and vec_pred_col_id_set + _is_all_column_basic_type = true; + bool is_predicate_column_exists = false; + bool is_non_predicate_column_exists = false; + + if (!_col_predicates.empty()) { + is_predicate_column_exists = true; + + std::set<ColumnId> short_cir_pred_col_id_set; // using set for distinct cid + std::set<ColumnId> vec_pred_col_id_set; + + for (auto predicate : _col_predicates) { + auto cid = predicate->column_id(); + FieldType type = _schema.column(cid)->type(); + _is_pred_column[cid] = true; + pred_column_ids.insert(cid); + + // for date type which can not be executed in a vectorized way, using short circuit execution + if (type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR || type == OLAP_FIELD_TYPE_DECIMAL + || type == OLAP_FIELD_TYPE_DATE || predicate->is_in_predicate()) { + short_cir_pred_col_id_set.insert(cid); + _short_cir_eval_predicate.push_back(predicate); + _is_all_column_basic_type = false; + } else { + vec_pred_col_id_set.insert(predicate->column_id()); + if (_pre_eval_block_predicate == nullptr) { + _pre_eval_block_predicate = new AndBlockColumnPredicate(); + } + reinterpret_cast<MutilColumnBlockPredicate*>(_pre_eval_block_predicate)->add_column_predicate(new SingleColumnBlockPredicate(predicate)); + } + } + + std::set<ColumnId> del_cond_id_set; + _opts.delete_condition_predicates.get()->get_all_column_ids(del_cond_id_set); + short_cir_pred_col_id_set.insert(del_cond_id_set.begin(), del_cond_id_set.end()); + 
pred_column_ids.insert(del_cond_id_set.begin(), del_cond_id_set.end()); + + if (_schema.column_ids().size() > pred_column_ids.size()) { + for (auto cid : _schema.column_ids()) { + if (!_is_pred_column[cid]) { + _non_predicate_columns.push_back(cid); + is_non_predicate_column_exists = true; + } + } + } + + _vec_pred_column_ids.assign(vec_pred_col_id_set.cbegin(), vec_pred_col_id_set.cend()); + _short_cir_pred_column_ids.assign(short_cir_pred_col_id_set.cbegin(), short_cir_pred_col_id_set.cend()); + } else { + _is_all_column_basic_type = false; + is_non_predicate_column_exists = true; + for (auto cid : _schema.column_ids()) { + _non_predicate_columns.push_back(cid); + } + } + + // note(wb) in following cases we disable lazy materialization + // case 1: when all column is basic type(is_all_column_basic_type = true) + // because we think `seek and read` cost > read page cost, lazy materialize may cause more `seek and read`, so disable it + // case 2: all column is predicate column + // case 3: all column is not predicate column + // todo(wb) need further research more lazy materialization rule, such as get more info from `statistics` for better decision + if (_is_all_column_basic_type) { + std::set<ColumnId> pred_set(_vec_pred_column_ids.begin(), _vec_pred_column_ids.end()); + std::set<ColumnId> non_pred_set(_non_predicate_columns.begin(), _non_predicate_columns.end()); + + // when _is_all_column_basic_type = true, _first_read_column_ids should keep the same order with _schema.column_ids which stands for return column order + for (int i = 0; i < _schema.num_column_ids(); i++) { + auto cid = _schema.column_ids()[i]; + if (pred_set.find(cid) != pred_set.end()) { + _first_read_column_ids.push_back(cid); + } else if (non_pred_set.find(cid) != non_pred_set.end()) { + _first_read_column_ids.push_back(cid); + _is_pred_column[cid] = true; // in this case, non-predicate column should also be filtered by sel idx, so we regard it as pred columns + } + } + + } else if 
(is_predicate_column_exists && !is_non_predicate_column_exists) { + _first_read_column_ids.assign(pred_column_ids.cbegin(), pred_column_ids.cend()); + } else if (!is_predicate_column_exists && is_non_predicate_column_exists) { + for (auto cid : _non_predicate_columns) { + _first_read_column_ids.push_back(cid); + } + } else { + _lazy_materialization_read = true; + _first_read_column_ids.assign(pred_column_ids.cbegin(), pred_column_ids.cend()); + } + + // make _schema_block_id_map + _schema_block_id_map.resize(_schema.columns().size()); + for (int i = 0; i < _schema.num_column_ids(); i++) { + auto cid = _schema.column_ids()[i]; + _schema_block_id_map[cid] = i; + } + +} + +Status SegmentIterator::_read_columns(const std::vector<ColumnId>& column_ids, vectorized::MutableColumns& column_block, size_t nrows) { + for (auto cid : column_ids) { + auto& column = column_block[cid]; + size_t rows_read = nrows; + RETURN_IF_ERROR(_column_iterators[cid]->next_batch(&rows_read, column)); + DCHECK_EQ(nrows, rows_read); + } + return Status::OK(); +} + +void SegmentIterator::_init_current_block(vectorized::Block* block, std::vector<vectorized::MutableColumnPtr>& current_columns) { Review comment: Why pass the member data _current_return_columns as a member function's argument? Why not use _current_return_columns in _init_current_block directly? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org