[doris] branch master updated: [improvement]Add reading by rowids to speed up lazy materialization (#10506)

yiguolei Thu, 30 Jun 2022 06:03:53 -0700

This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 18ad8ebfbb [improvement]Add reading by rowids to speed up lazy materialization (#10506)
18ad8ebfbb is described below

commit 18ad8ebfbbceda63412df30a76604f205dc9dc55
Author: Jerry Hu <mrh...@gmail.com>
AuthorDate: Thu Jun 30 21:03:41 2022 +0800

    [improvement]Add reading by rowids to speed up lazy materialization (#10506)
---
 be/src/olap/rowset/segment_v2/binary_dict_page.cpp | 32 ++++++++++
 be/src/olap/rowset/segment_v2/binary_dict_page.h   |  3 +
 be/src/olap/rowset/segment_v2/binary_plain_page.h  | 32 ++++++++++
 be/src/olap/rowset/segment_v2/bitshuffle_page.h    | 26 ++++++++
 be/src/olap/rowset/segment_v2/column_reader.cpp    | 74 ++++++++++++++++++++++
 be/src/olap/rowset/segment_v2/column_reader.h      |  8 +++
 be/src/olap/rowset/segment_v2/page_decoder.h       |  5 ++
 be/src/olap/rowset/segment_v2/rle_page.h           | 32 ++++++++++
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 42 +++---------
 be/src/vec/columns/column_string.h                 | 15 ++++-
 10 files changed, 235 insertions(+), 34 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp 
b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
index ea9fe66dfc..6fb1258604 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.cpp
@@ -261,6 +261,38 @@ Status BinaryDictPageDecoder::next_batch(size_t* n, 
vectorized::MutableColumnPtr
     return Status::OK();
 }
 
+Status BinaryDictPageDecoder::read_by_rowids(const rowid_t* rowids, ordinal_t 
page_first_ordinal,
+                                             size_t* n, 
vectorized::MutableColumnPtr& dst) {  // Reads up to *n values at the given row ids into dst; *n is set to the number read.
+    if (_encoding_type == PLAIN_ENCODING) {
+        return _data_page_decoder->read_by_rowids(rowids, page_first_ordinal, 
n, dst);  // plain-encoded page: delegate to the wrapped data page decoder
+    }
+    DCHECK(_parsed);
+    DCHECK(_dict_decoder != nullptr) << "dict decoder pointer is nullptr";
+
+    if (PREDICT_FALSE(*n == 0)) {
+        *n = 0;
+        return Status::OK();
+    }
+
+    const auto* data_array = reinterpret_cast<const 
int32_t*>(_bit_shuffle_ptr->get_data(0));  // page data viewed as int32 values (presumably dictionary codes)
+    auto total = *n;
+    size_t read_count = 0;
+    int32_t data[total];  // NOTE(review): runtime-sized stack array (a compiler extension in C++); its size follows the caller's *n
+    for (size_t i = 0; i < total; ++i) {
+        ordinal_t ord = rowids[i] - page_first_ordinal;  // row id relative to page_first_ordinal
+        if (PREDICT_FALSE(ord >= _bit_shuffle_ptr->_num_elements)) {
+            break;  // stop at the first row id whose offset is not below _num_elements
+        }
+
+        data[read_count++] = data_array[ord];
+    }
+
+    if (LIKELY(read_count > 0))
+        dst->insert_many_dict_data(data, 0, _dict_word_info, read_count, 
_dict_decoder->_num_elems);
+    *n = read_count;  // report how many values were actually gathered
+    return Status::OK();
+}
+
 Status BinaryDictPageDecoder::next_batch(size_t* n, ColumnBlockView* dst) {
     if (_encoding_type == PLAIN_ENCODING) {
         return _data_page_decoder->next_batch(n, dst);
diff --git a/be/src/olap/rowset/segment_v2/binary_dict_page.h 
b/be/src/olap/rowset/segment_v2/binary_dict_page.h
index d57f7cf974..21890f2901 100644
--- a/be/src/olap/rowset/segment_v2/binary_dict_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_dict_page.h
@@ -108,6 +108,9 @@ public:
 
     Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst) override;
 
+    Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, 
size_t* n,
+                          vectorized::MutableColumnPtr& dst) override;
+
     size_t count() const override { return _data_page_decoder->count(); }
 
     size_t current_index() const override { return 
_data_page_decoder->current_index(); }
diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h 
b/be/src/olap/rowset/segment_v2/binary_plain_page.h
index 1919efcba8..e4531fe00a 100644
--- a/be/src/olap/rowset/segment_v2/binary_plain_page.h
+++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h
@@ -271,6 +271,38 @@ public:
         return Status::OK();
     };
 
+    Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, 
size_t* n,
+                          vectorized::MutableColumnPtr& dst) override {  // Reads up to *n binary values at the given row ids into dst; *n is set to the number read.
+        DCHECK(_parsed);
+        if (PREDICT_FALSE(*n == 0)) {
+            *n = 0;
+            return Status::OK();
+        }
+
+        auto total = *n;
+        size_t read_count = 0;
+        uint32_t len_array[total];  // NOTE(review): this and the next array are runtime-sized stack arrays (a compiler extension in C++)
+        uint32_t start_offset_array[total];
+        for (size_t i = 0; i < total; ++i) {
+            ordinal_t ord = rowids[i] - page_first_ordinal;  // row id relative to page_first_ordinal
+            if (UNLIKELY(ord >= _num_elems)) {
+                break;  // stop at the first row id whose offset is not below _num_elems
+            }
+
+            const uint32_t start_offset = offset(ord);
+            start_offset_array[read_count] = start_offset;
+            len_array[read_count] = offset(ord + 1) - start_offset;  // length = distance to the next element's offset
+            read_count++;
+        }
+
+        if (LIKELY(read_count > 0))
+            dst->insert_many_binary_data(_data.mutable_data(), len_array, 
start_offset_array,
+                                         read_count);
+
+        *n = read_count;  // report how many values were actually gathered
+        return Status::OK();
+    }
+
     size_t count() const override {
         DCHECK(_parsed);
         return _num_elems;
diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_page.h 
b/be/src/olap/rowset/segment_v2/bitshuffle_page.h
index 234bda61b6..d0b22f663b 100644
--- a/be/src/olap/rowset/segment_v2/bitshuffle_page.h
+++ b/be/src/olap/rowset/segment_v2/bitshuffle_page.h
@@ -408,6 +408,32 @@ public:
         return Status::OK();
     };
 
+    Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, 
size_t* n,
+                          vectorized::MutableColumnPtr& dst) override {  // Reads up to *n fixed-length values at the given row ids into dst; *n is set to the number read.
+        DCHECK(_parsed);
+        if (PREDICT_FALSE(*n == 0)) {
+            *n = 0;
+            return Status::OK();
+        }
+
+        auto total = *n;
+        auto read_count = 0;  // NOTE(review): deduced as int, while total and *n are size_t
+        CppType data[total];  // NOTE(review): runtime-sized stack array (a compiler extension in C++)
+        for (size_t i = 0; i < total; ++i) {
+            ordinal_t ord = rowids[i] - page_first_ordinal;  // row id relative to page_first_ordinal
+            if (UNLIKELY(ord >= _num_elements)) {
+                break;  // stop at the first row id whose offset is not below _num_elements
+            }
+
+            data[read_count++] = *reinterpret_cast<CppType*>(get_data(ord));  // copy the element stored at position ord
+        }
+
+        if (LIKELY(read_count > 0)) dst->insert_many_fix_len_data((const 
char*)data, read_count);  // hand all gathered values to dst in one call
+
+        *n = read_count;  // report how many values were actually gathered
+        return Status::OK();
+    }
+
     Status peek_next_batch(size_t* n, ColumnBlockView* dst) override {
         return next_batch<false>(n, dst);
     }
diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp 
b/be/src/olap/rowset/segment_v2/column_reader.cpp
index a7d8c99899..9087c7b246 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -704,6 +704,80 @@ Status FileColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& d
     return Status::OK();
 }
 
+Status FileColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t 
count,
+                                          vectorized::MutableColumnPtr& dst) {  // Appends the rows named by rowids[0..count) to dst, page by page.
+    size_t remaining = count;
+    size_t total_read_count = 0;
+    size_t nrows_to_read = 0;
+    while (remaining > 0) {
+        RETURN_IF_ERROR(seek_to_ordinal(rowids[total_read_count]));  // seek to the next unread row id
+
+        // upper bound on the number of rows to read from this page
+        nrows_to_read = std::min(remaining, _page.remaining());
+
+        if (_page.has_null) {
+            size_t already_read = 0;
+            while ((nrows_to_read - already_read) > 0) {
+                bool is_null = false;
+                size_t this_run = std::min(nrows_to_read - already_read, 
_page.remaining());
+                if (UNLIKELY(this_run == 0)) {
+                    break;
+                }
+                this_run = _page.null_decoder.GetNextRun(&is_null, this_run);  // presumably the length of the next run sharing one null flag
+                size_t offset = total_read_count + already_read;
+                size_t this_read_count = 0;
+                rowid_t current_ordinal_in_page = _page.offset_in_page + 
_page.first_ordinal;
+                for (size_t i = 0; i < this_run; ++i) {  // count the requested row ids that fall inside this run
+                    if (rowids[offset + i] - current_ordinal_in_page >= 
this_run) {
+                        break;
+                    }
+                    this_read_count++;
+                }
+
+                auto origin_index = _page.data_decoder->current_index();
+                if (this_read_count > 0) {
+                    if (is_null) {
+                        auto* null_col =
+                                
vectorized::check_and_get_column<vectorized::ColumnNullable>(dst);
+                        if (UNLIKELY(null_col == nullptr)) {
+                            return Status::InternalError("unexpected column 
type in column reader");
+                        }
+
+                        
const_cast<vectorized::ColumnNullable*>(null_col)->insert_null_elements(
+                                this_read_count);
+                    } else {
+                        size_t read_count = this_read_count;
+
+                        // Ordinals in a nullable column's data buffer may not be contiguous (the data holds no null values),
+                        // so `page_start_off_in_decoder` is used to compute the actual offset in `data_decoder`.
+                        size_t page_start_off_in_decoder =
+                                _page.first_ordinal + _page.offset_in_page - 
origin_index;
+                        RETURN_IF_ERROR(_page.data_decoder->read_by_rowids(
+                                &rowids[offset], page_start_off_in_decoder, 
&read_count, dst));
+                        DCHECK_EQ(read_count, this_read_count);
+                    }
+                }
+
+                if (!is_null) 
_page.data_decoder->seek_to_position_in_page(origin_index + this_run);  // move the data decoder past the whole non-null run
+
+                already_read += this_read_count;
+                _page.offset_in_page += this_run;
+                DCHECK(_page.offset_in_page <= _page.num_rows);
+            }
+
+            nrows_to_read = already_read;
+            total_read_count += nrows_to_read;
+            remaining -= nrows_to_read;
+        } else {
+            RETURN_IF_ERROR(_page.data_decoder->read_by_rowids(  // propagate decoder errors instead of dropping the returned Status
+                    &rowids[total_read_count], _page.first_ordinal, &nrows_to_read, dst));
+            total_read_count += nrows_to_read;
+            remaining -= nrows_to_read;
+        }
+    }
+    return Status::OK();
+}
+
 Status FileColumnIterator::_load_next_page(bool* eos) {
     _page_iter.next();
     if (!_page_iter.valid()) {
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h 
b/be/src/olap/rowset/segment_v2/column_reader.h
index eb1536d69c..ed0303ca9b 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -247,6 +247,11 @@ public:
         return Status::NotSupported("not implement");
     }
 
+    virtual Status read_by_rowids(const rowid_t* rowids, const size_t count,
+                                  vectorized::MutableColumnPtr& dst) {  // Default implementation: always reports NotSupported; overriding iterators supply the real read.
+        return Status::NotSupported("not implement");
+    }
+
     virtual ordinal_t get_current_ordinal() const = 0;
 
     virtual Status get_row_ranges_by_zone_map(CondColumn* cond_column, 
CondColumn* delete_condition,
@@ -283,6 +288,9 @@ public:
 
     Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* 
has_null) override;
 
+    Status read_by_rowids(const rowid_t* rowids, const size_t count,
+                          vectorized::MutableColumnPtr& dst) override;
+
     ordinal_t get_current_ordinal() const override { return _current_ordinal; }
 
     // get row ranges by zone map
diff --git a/be/src/olap/rowset/segment_v2/page_decoder.h 
b/be/src/olap/rowset/segment_v2/page_decoder.h
index 3b0861f6b5..7c817a08ac 100644
--- a/be/src/olap/rowset/segment_v2/page_decoder.h
+++ b/be/src/olap/rowset/segment_v2/page_decoder.h
@@ -86,6 +86,11 @@ public:
         return Status::NotSupported("not implement vec op now");
     }
 
+    virtual Status read_by_rowids(const rowid_t* rowids, ordinal_t 
page_first_ordinal, size_t* n,
+                                  vectorized::MutableColumnPtr& dst) {  // Default implementation: always reports NotSupported; overriding decoders supply the real read.
+        return Status::NotSupported("not implement vec op now");
+    }
+
     // Same as `next_batch` except for not moving forward the cursor.
     // When read array's ordinals in `ArrayFileColumnIterator`, we want to 
read one extra ordinal
     // but do not want to move forward the cursor.
diff --git a/be/src/olap/rowset/segment_v2/rle_page.h 
b/be/src/olap/rowset/segment_v2/rle_page.h
index c6eb413aa7..0cd4d3dd2a 100644
--- a/be/src/olap/rowset/segment_v2/rle_page.h
+++ b/be/src/olap/rowset/segment_v2/rle_page.h
@@ -254,6 +254,38 @@ public:
         return Status::OK();
     };
 
+    Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, 
size_t* n,
+                          vectorized::MutableColumnPtr& dst) override {  // Reads up to *n run-length-encoded values at the given row ids into dst; *n is set to the number read.
+        DCHECK(_parsed);
+        if (PREDICT_FALSE(*n == 0 || _cur_index >= _num_elements)) {  // nothing requested, or the cursor is already at/after _num_elements
+            *n = 0;
+            return Status::OK();
+        }
+
+        auto total = *n;
+        bool result = false;
+        size_t read_count = 0;
+        CppType value;
+        for (size_t i = 0; i < total; ++i) {
+            ordinal_t ord = rowids[i] - page_first_ordinal;  // row id relative to page_first_ordinal
+            if (UNLIKELY(ord >= _num_elements)) {
+                *n = read_count;  // stop early and report the values read so far
+                return Status::OK();
+            }
+
+            _rle_decoder.Skip(ord - _cur_index);  // NOTE(review): assumes ord >= _cur_index, i.e. row ids never move backwards — confirm
+            _cur_index = ord;
+
+            result = _rle_decoder.Get(&value);
+            _cur_index++;
+            DCHECK(result);
+            dst->insert_data((char*)(&value), SIZE_OF_TYPE);  // values are appended to dst one at a time
+            read_count++;
+        }
+        *n = read_count;
+        return Status::OK();
+    }
+
     size_t count() const override { return _num_elements; }
 
     size_t current_index() const override { return _cur_index; }
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 61adede9f2..d00d1856bd 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -699,28 +699,12 @@ void SegmentIterator::_vec_init_lazy_materialization() {
 
     // Step 2: check non-predicate read costs to determine whether need lazy 
materialization
     // fill _non_predicate_columns.
-    // note(wb) For block schema, query layer and storage layer may have some 
diff
-    //   query layer block schema not contains delete column, but storage 
layer appends delete column to end of block schema
-    //   When output block to query layer, delete column can be skipped.
-    //  _schema.column_ids() stands for storage layer block schema, so it 
contains delete columnid
-    //  we just regard delete column as common pred column here.
+    // After some optimization, we assume lazy materialization performs better.
     if (_schema.column_ids().size() > pred_column_ids.size()) {
         for (auto cid : _schema.column_ids()) {
             if (!_is_pred_column[cid]) {
                 _non_predicate_columns.push_back(cid);
-                FieldType type = _schema.column(cid)->type();
-
-                // todo(wb) maybe we can make read char type faster
-                // todo(wb) support map/array type
-                // todo(wb) consider multiple integer columns cost, such as 
1000 columns, maybe lazy materialization faster
-                if (!_lazy_materialization_read &&
-                    (_is_need_vec_eval ||
-                     _is_need_short_eval) && // only when pred exists, we need 
to consider lazy materialization
-                    (type == OLAP_FIELD_TYPE_HLL || type == 
OLAP_FIELD_TYPE_OBJECT ||
-                     type == OLAP_FIELD_TYPE_VARCHAR || type == 
OLAP_FIELD_TYPE_CHAR ||
-                     type == OLAP_FIELD_TYPE_STRING || type == 
OLAP_FIELD_TYPE_BOOL ||
-                     type == OLAP_FIELD_TYPE_DATE || type == 
OLAP_FIELD_TYPE_DATETIME ||
-                     type == OLAP_FIELD_TYPE_DECIMAL)) {
+                if (_is_need_vec_eval || _is_need_short_eval) {
                     _lazy_materialization_read = true;
                 }
             }
@@ -971,21 +955,13 @@ void 
SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_column
                                               uint16_t* sel_rowid_idx, size_t 
select_size,
                                               vectorized::MutableColumns* 
mutable_columns) {
     SCOPED_RAW_TIMER(&_opts.stats->lazy_read_ns);
-    size_t start_idx = 0;
-    while (start_idx < select_size) {
-        size_t end_idx = start_idx + 1;
-        while (end_idx < select_size && (rowid_vector[sel_rowid_idx[end_idx - 
1]] ==
-                                         rowid_vector[sel_rowid_idx[end_idx]] 
- 1)) {
-            end_idx++;
-        }
-        size_t range = end_idx - start_idx;
-        {
-            _opts.stats->block_lazy_read_seek_num += 1;
-            SCOPED_RAW_TIMER(&_opts.stats->block_lazy_read_seek_ns);
-            _seek_columns(read_column_ids, 
rowid_vector[sel_rowid_idx[start_idx]]);
-        }
-        _read_columns(read_column_ids, *mutable_columns, range);
-        start_idx += range;
+    std::vector<rowid_t> rowids(select_size);
+    for (size_t i = 0; i < select_size; ++i) {
+        rowids[i] = rowid_vector[sel_rowid_idx[i]];
+    }
+    for (auto cid : read_column_ids) {
+        auto& column = (*mutable_columns)[cid];
+        _column_iterators[cid]->read_by_rowids(rowids.data(), select_size, 
column);
     }
 }
 
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index 421255a77b..9b92890ed4 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -160,10 +160,23 @@ public:
 
     void insert_many_binary_data(char* data_array, uint32_t* len_array,
                                  uint32_t* start_offset_array, size_t num) 
override {
+        size_t new_size = 0;
+        for (size_t i = 0; i < num; i++) {
+            new_size += len_array[i] + 1;  // each string takes len bytes plus one trailing zero byte
+        }
+
+        const size_t old_size = chars.size();
+        chars.resize(old_size + new_size);  // grow the character buffer once for all num strings
+
+        Char* data = chars.data();
+        size_t offset = old_size;
         for (size_t i = 0; i < num; i++) {
             uint32_t len = len_array[i];
             uint32_t start_offset = start_offset_array[i];
-            insert_data(data_array + start_offset, len);
+            if (len) memcpy(data + offset, data_array + start_offset, len);
+            data[offset + len] = 0;  // zero byte written after every string
+            offset += len + 1;
+            offsets.push_back(offset);  // record the end position of this string within chars
         }
     };
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[doris] branch master updated: [improvement]Add reading by rowids to speed up lazy materialization (#10506)

Reply via email to