This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 18ad8ebfbb [improvement]Add reading by rowids to speed up lazy materialization (#10506) 18ad8ebfbb is described below commit 18ad8ebfbbceda63412df30a76604f205dc9dc55 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Thu Jun 30 21:03:41 2022 +0800 [improvement]Add reading by rowids to speed up lazy materialization (#10506) --- be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 32 ++++++++++ be/src/olap/rowset/segment_v2/binary_dict_page.h | 3 + be/src/olap/rowset/segment_v2/binary_plain_page.h | 32 ++++++++++ be/src/olap/rowset/segment_v2/bitshuffle_page.h | 26 ++++++++ be/src/olap/rowset/segment_v2/column_reader.cpp | 74 ++++++++++++++++++++++ be/src/olap/rowset/segment_v2/column_reader.h | 8 +++ be/src/olap/rowset/segment_v2/page_decoder.h | 5 ++ be/src/olap/rowset/segment_v2/rle_page.h | 32 ++++++++++ be/src/olap/rowset/segment_v2/segment_iterator.cpp | 42 +++--------- be/src/vec/columns/column_string.h | 15 ++++- 10 files changed, 235 insertions(+), 34 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp index ea9fe66dfc..6fb1258604 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp @@ -261,6 +261,38 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, vectorized::MutableColumnPtr return Status::OK(); } +Status BinaryDictPageDecoder::read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, + size_t* n, vectorized::MutableColumnPtr& dst) { + if (_encoding_type == PLAIN_ENCODING) { + return _data_page_decoder->read_by_rowids(rowids, page_first_ordinal, n, dst); + } + DCHECK(_parsed); + DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr"; + + if (PREDICT_FALSE(*n == 0)) { + *n = 0; + return Status::OK(); + } + + const auto* data_array = reinterpret_cast<const int32_t*>(_bit_shuffle_ptr->get_data(0)); + auto total = *n; + size_t read_count = 0; + int32_t data[total]; + for (size_t i = 0; i < total; ++i) { + ordinal_t ord = rowids[i] - page_first_ordinal; + if (PREDICT_FALSE(ord >= _bit_shuffle_ptr->_num_elements)) { + break; + } + + data[read_count++] = data_array[ord]; + } + + if (LIKELY(read_count > 0)) + dst->insert_many_dict_data(data, 0, _dict_word_info, read_count, _dict_decoder->_num_elems); + *n = read_count; + return Status::OK(); +} + Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) { if (_encoding_type == PLAIN_ENCODING) { return _data_page_decoder->next_batch(n, dst); diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h b/be/src/olap/rowset/segment_v2/binary_dict_page.h index d57f7cf974..21890f2901 100644 --- a/be/src/olap/rowset/segment_v2/binary_dict_page.h +++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h @@ -108,6 +108,9 @@ public: Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst) override; + Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n, + vectorized::MutableColumnPtr& dst) override; + size_t count() const override { return _data_page_decoder->count(); } size_t current_index() const override { return _data_page_decoder->current_index(); } diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index 1919efcba8..e4531fe00a 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -271,6 +271,38 @@ public: return Status::OK(); }; + Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n, + vectorized::MutableColumnPtr& dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0)) { + *n = 0; + return Status::OK(); + } + + auto total = *n; + size_t read_count = 0; + uint32_t len_array[total]; + uint32_t start_offset_array[total]; + for (size_t i = 0; i < total; ++i) { + ordinal_t ord = rowids[i] - page_first_ordinal; + if (UNLIKELY(ord >= _num_elems)) { + break; + } + + const uint32_t start_offset = offset(ord); + start_offset_array[read_count] = start_offset; + len_array[read_count] = offset(ord + 1) - start_offset; + read_count++; + } + + if (LIKELY(read_count > 0)) + dst->insert_many_binary_data(_data.mutable_data(), len_array, start_offset_array, + read_count); + + *n = read_count; + return Status::OK(); + } + size_t count() const override { DCHECK(_parsed); return _num_elems; diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h b/be/src/olap/rowset/segment_v2/bitshuffle_page.h index 234bda61b6..d0b22f663b 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h +++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h @@ -408,6 +408,32 @@ public: return Status::OK(); }; + Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n, + vectorized::MutableColumnPtr& dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0)) { + *n = 0; + return Status::OK(); + } + + auto total = *n; + auto read_count = 0; + CppType data[total]; + for (size_t i = 0; i < total; ++i) { + ordinal_t ord = rowids[i] - page_first_ordinal; + if (UNLIKELY(ord >= _num_elements)) { + break; + } + + data[read_count++] = *reinterpret_cast<CppType*>(get_data(ord)); + } + + if (LIKELY(read_count > 0)) dst->insert_many_fix_len_data((const char*)data, read_count); + + *n = read_count; + return Status::OK(); + } + Status peek_next_batch(size_t* n, ColumnBlockView* dst) override { return next_batch<false>(n, dst); } diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index a7d8c99899..9087c7b246 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -704,6 +704,80 @@ Status FileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& d return Status::OK(); } +Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t count, + vectorized::MutableColumnPtr& dst) { + size_t remaining = count; + size_t total_read_count = 0; + size_t nrows_to_read = 0; + while (remaining > 0) { + RETURN_IF_ERROR(seek_to_ordinal(rowids[total_read_count])); + + // number of rows to be read from this page + nrows_to_read = std::min(remaining, _page.remaining()); + + if (_page.has_null) { + size_t already_read = 0; + while ((nrows_to_read - already_read) > 0) { + bool is_null = false; + size_t this_run = std::min(nrows_to_read - already_read, _page.remaining()); + if (UNLIKELY(this_run == 0)) { + break; + } + this_run = _page.null_decoder.GetNextRun(&is_null, this_run); + size_t offset = total_read_count + already_read; + size_t this_read_count = 0; + rowid_t current_ordinal_in_page = _page.offset_in_page + _page.first_ordinal; + for (size_t i = 0; i < this_run; ++i) { + if (rowids[offset + i] - current_ordinal_in_page >= this_run) { + break; + } + this_read_count++; + } + + auto origin_index = _page.data_decoder->current_index(); + if (this_read_count > 0) { + if (is_null) { + auto* null_col = + vectorized::check_and_get_column<vectorized::ColumnNullable>(dst); + if (UNLIKELY(null_col == nullptr)) { + return Status::InternalError("unexpected column type in column reader"); + } + + const_cast<vectorized::ColumnNullable*>(null_col)->insert_null_elements( + this_read_count); + } else { + size_t read_count = this_read_count; + + // ordinal in nullable columns' data buffer maybe be not continuously(the data doesn't contain null value), + // so we need use `page_start_off_in_decoder` to calculate the actual offset in `data_decoder` + size_t page_start_off_in_decoder = + _page.first_ordinal + _page.offset_in_page - origin_index; + RETURN_IF_ERROR(_page.data_decoder->read_by_rowids( + &rowids[offset], page_start_off_in_decoder, &read_count, dst)); + DCHECK_EQ(read_count, this_read_count); + } + } + + if (!is_null) _page.data_decoder->seek_to_position_in_page(origin_index + this_run); + + already_read += this_read_count; + _page.offset_in_page += this_run; + DCHECK(_page.offset_in_page <= _page.num_rows); + } + + nrows_to_read = already_read; + total_read_count += nrows_to_read; + remaining -= nrows_to_read; + } else { + _page.data_decoder->read_by_rowids(&rowids[total_read_count], _page.first_ordinal, + &nrows_to_read, dst); + total_read_count += nrows_to_read; + remaining -= nrows_to_read; + } + } + return Status::OK(); +} + Status FileColumnIterator::_load_next_page(bool* eos) { _page_iter.next(); if (!_page_iter.valid()) { diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index eb1536d69c..ed0303ca9b 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -247,6 +247,11 @@ public: return Status::NotSupported("not implement"); } + virtual Status read_by_rowids(const rowid_t* rowids, const size_t count, + vectorized::MutableColumnPtr& dst) { + return Status::NotSupported("not implement"); + } + virtual ordinal_t get_current_ordinal() const = 0; virtual Status get_row_ranges_by_zone_map(CondColumn* cond_column, CondColumn* delete_condition, @@ -283,6 +288,9 @@ public: Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* has_null) override; + Status read_by_rowids(const rowid_t* rowids, const size_t count, + vectorized::MutableColumnPtr& dst) override; + ordinal_t get_current_ordinal() const override { return _current_ordinal; } // get row ranges by zone map diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h b/be/src/olap/rowset/segment_v2/page_decoder.h index 3b0861f6b5..7c817a08ac 100644 --- a/be/src/olap/rowset/segment_v2/page_decoder.h +++ b/be/src/olap/rowset/segment_v2/page_decoder.h @@ -86,6 +86,11 @@ public: return Status::NotSupported("not implement vec op now"); } + virtual Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n, + vectorized::MutableColumnPtr& dst) { + return Status::NotSupported("not implement vec op now"); + } + // Same as `next_batch` except for not moving forward the cursor. // When read array's ordinals in `ArrayFileColumnIterator`, we want to read one extra ordinal // but do not want to move forward the cursor. diff --git a/be/src/olap/rowset/segment_v2/rle_page.h b/be/src/olap/rowset/segment_v2/rle_page.h index c6eb413aa7..0cd4d3dd2a 100644 --- a/be/src/olap/rowset/segment_v2/rle_page.h +++ b/be/src/olap/rowset/segment_v2/rle_page.h @@ -254,6 +254,38 @@ public: return Status::OK(); }; + Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n, + vectorized::MutableColumnPtr& dst) override { + DCHECK(_parsed); + if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) { + *n = 0; + return Status::OK(); + } + + auto total = *n; + bool result = false; + size_t read_count = 0; + CppType value; + for (size_t i = 0; i < total; ++i) { + ordinal_t ord = rowids[i] - page_first_ordinal; + if (UNLIKELY(ord >= _num_elements)) { + *n = read_count; + return Status::OK(); + } + + _rle_decoder.Skip(ord - _cur_index); + _cur_index = ord; + + result = _rle_decoder.Get(&value); + _cur_index++; + DCHECK(result); + dst->insert_data((char*)(&value), SIZE_OF_TYPE); + read_count++; + } + *n = read_count; + return Status::OK(); + } + size_t count() const override { return _num_elements; } size_t current_index() const override { return _cur_index; } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 61adede9f2..d00d1856bd 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -699,28 +699,12 @@ void SegmentIterator::_vec_init_lazy_materialization() { // Step 2: check non-predicate read costs to determine whether need lazy materialization // fill _non_predicate_columns. - // note(wb) For block schema, query layer and storage layer may have some diff - // query layer block schema not contains delete column, but storage layer appends delete column to end of block schema - // When output block to query layer, delete column can be skipped. - // _schema.column_ids() stands for storage layer block schema, so it contains delete columnid - // we just regard delete column as common pred column here. + // After some optimization, we suppose lazy materialization is better performance. if (_schema.column_ids().size() > pred_column_ids.size()) { for (auto cid : _schema.column_ids()) { if (!_is_pred_column[cid]) { _non_predicate_columns.push_back(cid); - FieldType type = _schema.column(cid)->type(); - - // todo(wb) maybe we can make read char type faster - // todo(wb) support map/array type - // todo(wb) consider multiple integer columns cost, such as 1000 columns, maybe lazy materialization faster - if (!_lazy_materialization_read && - (_is_need_vec_eval || - _is_need_short_eval) && // only when pred exists, we need to consider lazy materialization - (type == OLAP_FIELD_TYPE_HLL || type == OLAP_FIELD_TYPE_OBJECT || - type == OLAP_FIELD_TYPE_VARCHAR || type == OLAP_FIELD_TYPE_CHAR || - type == OLAP_FIELD_TYPE_STRING || type == OLAP_FIELD_TYPE_BOOL || - type == OLAP_FIELD_TYPE_DATE || type == OLAP_FIELD_TYPE_DATETIME || - type == OLAP_FIELD_TYPE_DECIMAL)) { + if (_is_need_vec_eval || _is_need_short_eval) { _lazy_materialization_read = true; } } @@ -971,21 +955,13 @@ void SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_column uint16_t* sel_rowid_idx, size_t select_size, vectorized::MutableColumns* mutable_columns) { SCOPED_RAW_TIMER(&_opts.stats->lazy_read_ns); - size_t start_idx = 0; - while (start_idx < select_size) { - size_t end_idx = start_idx + 1; - while (end_idx < select_size && (rowid_vector[sel_rowid_idx[end_idx - 1]] == - rowid_vector[sel_rowid_idx[end_idx]] - 1)) { - end_idx++; - } - size_t range = end_idx - start_idx; - { - _opts.stats->block_lazy_read_seek_num += 1; - SCOPED_RAW_TIMER(&_opts.stats->block_lazy_read_seek_ns); - _seek_columns(read_column_ids, rowid_vector[sel_rowid_idx[start_idx]]); - } - _read_columns(read_column_ids, *mutable_columns, range); - start_idx += range; + std::vector<rowid_t> rowids(select_size); + for (size_t i = 0; i < select_size; ++i) { + rowids[i] = rowid_vector[sel_rowid_idx[i]]; + } + for (auto cid : read_column_ids) { + auto& column = (*mutable_columns)[cid]; + _column_iterators[cid]->read_by_rowids(rowids.data(), select_size, column); } } diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index 421255a77b..9b92890ed4 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -160,10 +160,23 @@ public: void insert_many_binary_data(char* data_array, uint32_t* len_array, uint32_t* start_offset_array, size_t num) override { + size_t new_size = 0; + for (size_t i = 0; i < num; i++) { + new_size += len_array[i] + 1; + } + + const size_t old_size = chars.size(); + chars.resize(old_size + new_size); + + Char* data = chars.data(); + size_t offset = old_size; for (size_t i = 0; i < num; i++) { uint32_t len = len_array[i]; uint32_t start_offset = start_offset_array[i]; - insert_data(data_array + start_offset, len); + if (len) memcpy(data + offset, data_array + start_offset, len); + data[offset + len] = 0; + offset += len + 1; + offsets.push_back(offset); } }; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org