This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 904e757679 [improvement]Do not lazily read dict encoded columns (#10420) 904e757679 is described below commit 904e7576797c796b809823647a769bc1d4569115 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Sun Jun 26 22:08:48 2022 +0800 [improvement]Do not lazily read dict encoded columns (#10420) --- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 127 +++++++++++---------- be/src/olap/rowset/segment_v2/segment_iterator.h | 8 +- 2 files changed, 71 insertions(+), 64 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 61adede9f2..cc0e686131 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -643,13 +643,18 @@ Status SegmentIterator::next_batch(RowBlockV2* block) { // todo(wb) need a UT here void SegmentIterator::_vec_init_lazy_materialization() { - _is_pred_column.resize(_schema.columns().size(), false); + _is_first_read_column.resize(_schema.columns().size(), false); // including short/vec/delete pred std::set<ColumnId> pred_column_ids; + + // Other columns(not predicate columns) can read firstly together with predicate columns + std::set<ColumnId> other_first_read_column_ids; + _lazy_materialization_read = false; std::set<ColumnId> del_cond_id_set; + std::set<ColumnId> lazy_read_column_ids; _opts.delete_condition_predicates->get_all_column_ids(del_cond_id_set); if (!_col_predicates.empty() || !del_cond_id_set.empty()) { @@ -658,7 +663,7 @@ void SegmentIterator::_vec_init_lazy_materialization() { for (auto predicate : _col_predicates) { auto cid = predicate->column_id(); - _is_pred_column[cid] = true; + _is_first_read_column[cid] = true; pred_column_ids.insert(cid); // Step1: check pred using short eval or vec eval @@ -681,7 +686,7 @@ void SegmentIterator::_vec_init_lazy_materialization() { pred_column_ids.insert(del_cond_id_set.begin(), del_cond_id_set.end()); for (auto cid : del_cond_id_set) { - _is_pred_column[cid] = true; + _is_first_read_column[cid] = true; } } @@ -698,65 +703,58 @@ void SegmentIterator::_vec_init_lazy_materialization() { } // Step 2: check non-predicate read costs to determine whether need lazy materialization - // fill _non_predicate_columns. // note(wb) For block schema, query layer and storage layer may have some diff // query layer block schema not contains delete column, but storage layer appends delete column to end of block schema // When output block to query layer, delete column can be skipped. // _schema.column_ids() stands for storage layer block schema, so it contains delete columnid // we just regard delete column as common pred column here. - if (_schema.column_ids().size() > pred_column_ids.size()) { - for (auto cid : _schema.column_ids()) { - if (!_is_pred_column[cid]) { - _non_predicate_columns.push_back(cid); - FieldType type = _schema.column(cid)->type(); - - // todo(wb) maybe we can make read char type faster - // todo(wb) support map/array type - // todo(wb) consider multiple integer columns cost, such as 1000 columns, maybe lazy materialization faster - if (!_lazy_materialization_read && - (_is_need_vec_eval || - _is_need_short_eval) && // only when pred exists, we need to consider lazy materialization - (type == OLAP_FIELD_TYPE_HLL || type == OLAP_FIELD_TYPE_OBJECT || - type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR || - type == OLAP_FIELD_TYPE_STRING || type == OLAP_FIELD_TYPE_BOOL || - type == OLAP_FIELD_TYPE_DATE || type == OLAP_FIELD_TYPE_DATETIME || - type == OLAP_FIELD_TYPE_DECIMAL)) { - _lazy_materialization_read = true; + for (size_t i = 0; i < _schema.num_column_ids(); ++i) { + auto cid = _schema.column_id(i); + FieldType type = _schema.column(cid)->type(); + if (!_is_first_read_column[cid]) { + switch (type) { + case OLAP_FIELD_TYPE_VARCHAR: + case OLAP_FIELD_TYPE_CHAR: + case OLAP_FIELD_TYPE_STRING: { + // if a string column is all dict encoding in one segment, it's almost same as + // an int32_t column, it can be read together with predicate columns. + if (config::enable_low_cardinality_optimize && + _column_iterators[cid]->is_all_dict_encoding()) { + other_first_read_column_ids.insert(cid); + _is_first_read_column[cid] = true; + } else { + lazy_read_column_ids.insert(cid); } + break; + } + case OLAP_FIELD_TYPE_HLL: + case OLAP_FIELD_TYPE_OBJECT: + case OLAP_FIELD_TYPE_BOOL: + case OLAP_FIELD_TYPE_DATE: + case OLAP_FIELD_TYPE_DATETIME: + case OLAP_FIELD_TYPE_DECIMAL: + lazy_read_column_ids.insert(cid); + break; + default: + other_first_read_column_ids.insert(cid); + _is_first_read_column[cid] = true; + break; } } } - // Step 3: fill column ids for read and output - if (_lazy_materialization_read) { - // insert pred cid to first_read_columns - for (auto cid : pred_column_ids) { - _first_read_column_ids.push_back(cid); - } - } else if (!_is_need_vec_eval && - !_is_need_short_eval) { // no pred exists, just read and output column - for (int i = 0; i < _schema.num_column_ids(); i++) { - auto cid = _schema.column_id(i); - _first_read_column_ids.push_back(cid); - } - } else { // pred exits, but we can eliminate lazy materialization - // insert pred/non-pred cid to first read columns - std::set<ColumnId> pred_id_set; - pred_id_set.insert(_short_cir_pred_column_ids.begin(), _short_cir_pred_column_ids.end()); - pred_id_set.insert(_vec_pred_column_ids.begin(), _vec_pred_column_ids.end()); - std::set<ColumnId> non_pred_set(_non_predicate_columns.begin(), - _non_predicate_columns.end()); - - for (int i = 0; i < _schema.num_column_ids(); i++) { - auto cid = _schema.column_id(i); - if (pred_id_set.find(cid) != pred_id_set.end()) { - _first_read_column_ids.push_back(cid); - } else if (non_pred_set.find(cid) != non_pred_set.end()) { - _first_read_column_ids.push_back(cid); - // when _lazy_materialization_read = false, non-predicate column should also be filtered by sel idx, so we regard it as pred columns - _is_pred_column[cid] = true; - } - } + _first_read_column_ids.assign(pred_column_ids.begin(), pred_column_ids.end()); + _first_read_column_ids.insert(_first_read_column_ids.end(), other_first_read_column_ids.begin(), + other_first_read_column_ids.end()); + if (_is_need_vec_eval || _is_need_short_eval) { + _lazy_materialization_read = !lazy_read_column_ids.empty(); + _lazy_read_column_ids.assign(lazy_read_column_ids.begin(), lazy_read_column_ids.end()); + } else { + _lazy_materialization_read = false; + + // No need to lazy read, all columns should be read firstly. + _first_read_column_ids.insert(_first_read_column_ids.end(), lazy_read_column_ids.begin(), + lazy_read_column_ids.end()); } // make _schema_block_id_map @@ -822,7 +820,7 @@ void SegmentIterator::_init_current_block( auto column_desc = _schema.column(cid); // the column in block must clear() here to insert new data - if (_is_pred_column[cid] || + if (_is_first_read_column[cid] || i >= block->columns()) { //todo(wb) maybe we can release it after output block current_columns[cid]->clear(); } else { // non-predicate column @@ -841,9 +839,14 @@ void SegmentIterator::_init_current_block( } } -void SegmentIterator::_output_non_pred_columns(vectorized::Block* block) { +void SegmentIterator::_output_lazy_read_columns(vectorized::Block* block) { + _output_columns_by_relace(block, _lazy_read_column_ids); +} + +void SegmentIterator::_output_columns_by_relace(vectorized::Block* block, + const std::vector<ColumnId>& column_ids) { SCOPED_RAW_TIMER(&_opts.stats->output_col_ns); - for (auto cid : _non_predicate_columns) { + for (auto cid : column_ids) { auto loc = _schema_block_id_map[cid]; // if loc < block->block->columns() means the column is delete column and should // not output by block, so just skip the column. @@ -1004,7 +1007,7 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { for (size_t i = 0; i < _schema.num_column_ids(); i++) { auto cid = _schema.column_id(i); auto column_desc = _schema.column(cid); - if (_is_pred_column[cid]) { + if (_is_first_read_column[cid]) { _current_return_columns[cid] = Schema::get_predicate_column_nullable_ptr( column_desc->type(), column_desc->is_nullable()); _current_return_columns[cid]->reserve(_opts.block_row_max); @@ -1036,7 +1039,7 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { for (int i = 0; i < block->columns(); i++) { auto cid = _schema.column_id(i); // todo(wb) abstract make column where - if (!_is_pred_column[cid]) { // non-predicate + if (!_is_first_read_column[cid]) { // non-predicate block->replace_by_position(i, std::move(_current_return_columns[cid])); } } @@ -1045,7 +1048,7 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { } if (!_is_need_vec_eval && !_is_need_short_eval) { - _output_non_pred_columns(block); + _output_columns_by_relace(block, _first_read_column_ids); } else { uint16_t selected_size = nrows_read; uint16_t sel_rowid_idx[selected_size]; @@ -1070,13 +1073,13 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { return ret; } - // step3: read non_predicate column - _read_columns_by_rowids(_non_predicate_columns, _block_rowids, sel_rowid_idx, selected_size, + // step3: read lazy_read_column_ids column + _read_columns_by_rowids(_lazy_read_column_ids, _block_rowids, sel_rowid_idx, selected_size, &_current_return_columns); // step4: output columns - // 4.1 output non-predicate column - _output_non_pred_columns(block); + // 4.1 output lazy_read_column_ids column + _output_lazy_read_columns(block); // 4.3 output short circuit and predicate column // when lazy materialization enables, _first_read_column_ids = distinct(_short_cir_pred_column_ids + _vec_pred_column_ids) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index fbf44c34da..541c721dc5 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -101,7 +101,9 @@ private: std::vector<vectorized::MutableColumnPtr>& non_pred_vector); uint16_t _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); uint16_t _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); - void _output_non_pred_columns(vectorized::Block* block); + void _output_lazy_read_columns(vectorized::Block* block); + void _output_columns_by_relace(vectorized::Block* block, + const std::vector<ColumnId>& column_ids); void _read_columns_by_rowids(std::vector<ColumnId>& read_column_ids, std::vector<rowid_t>& rowid_vector, uint16_t* sel_rowid_idx, size_t select_size, vectorized::MutableColumns* mutable_columns); @@ -156,7 +158,7 @@ private: _vec_pred_column_ids; // keep columnId of columns for vectorized predicate evaluation std::vector<ColumnId> _short_cir_pred_column_ids; // keep columnId of columns for short circuit predicate evaluation - std::vector<bool> _is_pred_column; // columns hold by segmentIter + std::vector<bool> _is_first_read_column; // columns hold by segmentIter vectorized::MutableColumns _current_return_columns; std::unique_ptr<AndBlockColumnPredicate> _pre_eval_block_predicate; std::vector<ColumnPredicate*> _short_cir_eval_predicate; @@ -165,6 +167,8 @@ private: // second, read non-predicate columns // so we need a field to stand for columns first time to read std::vector<ColumnId> _first_read_column_ids; + std::vector<ColumnId> _lazy_read_column_ids; + std::vector<int> _schema_block_id_map; // map from schema column id to column idx in Block // the actual init process is delayed to the first call to next_batch() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org