This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 983cdc7b0d [feature-wip](array-type) Support loading data in vectorized format (#10065) 983cdc7b0d is described below commit 983cdc7b0dec2a7a2838ee364633725cd2117a31 Author: Adonis Ling <adonis0...@gmail.com> AuthorDate: Wed Jun 15 14:40:28 2022 +0800 [feature-wip](array-type) Support loading data in vectorized format (#10065) --- be/src/olap/rowset/segment_v2/column_reader.cpp | 45 +++++++++++++++++++++++-- be/src/olap/rowset/segment_v2/column_reader.h | 2 ++ be/test/runtime/array_test.cpp | 42 ++++++++++++++++++++++- 3 files changed, 86 insertions(+), 3 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 88eb1bfc3e..d68ffc66c0 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -17,7 +17,6 @@ #include "olap/rowset/segment_v2/column_reader.h" -#include "common/logging.h" #include "gutil/strings/substitute.h" // for Substitute #include "olap/column_block.h" // for ColumnBlockView #include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder @@ -28,8 +27,9 @@ #include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer #include "olap/types.h" // for TypeInfo #include "util/block_compression.h" -#include "util/coding.h" // for get_varint32 #include "util/rle_encoding.h" // for RleDecoder +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" #include "vec/core/types.h" #include "vec/runtime/vdatetime_value.h" //for VecDateTime @@ -462,6 +462,47 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, ColumnBlockView* dst, bool return Status::OK(); } +Status ArrayFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, + bool* has_null) { + const auto* column_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + dst->is_nullable() ? static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column() + : *dst); + + bool offsets_has_null = false; + auto column_offsets_ptr = column_array->get_offsets_column().assume_mutable(); + ssize_t start = column_offsets_ptr->size(); + RETURN_IF_ERROR(_length_iterator->next_batch(n, column_offsets_ptr, &offsets_has_null)); + if (*n == 0) { + return Status::OK(); + } + auto& column_offsets = + static_cast<vectorized::ColumnArray::ColumnOffsets&>(*column_offsets_ptr); + auto& offsets_data = column_offsets.get_data(); + for (ssize_t i = start; i < offsets_data.size(); ++i) { + offsets_data[i] += offsets_data[i - 1]; // -1 is ok + } + + auto column_items_ptr = column_array->get_data().assume_mutable(); + size_t num_items = offsets_data.back() - offsets_data[start - 1]; + if (num_items > 0) { + size_t num_read = num_items; + bool items_has_null = false; + RETURN_IF_ERROR(_item_iterator->next_batch(&num_read, column_items_ptr, &items_has_null)); + DCHECK(num_read == num_items); + } + + if (dst->is_nullable()) { + auto null_map_ptr = + static_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column_ptr(); + size_t num_read = *n; + bool null_signs_has_null = false; + RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + DCHECK(num_read == *n); + } + + return Status::OK(); +} + //////////////////////////////////////////////////////////////////////////////// FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) {} diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index de51b01824..47250a96cb 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -335,6 +335,8 @@ public: Status next_batch(size_t* n, ColumnBlockView* dst, bool* has_null) override; + Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* has_null) override; + Status seek_to_first() override { RETURN_IF_ERROR(_length_iterator->seek_to_first()); RETURN_IF_ERROR(_item_iterator->seek_to_first()); // lazy??? diff --git a/be/test/runtime/array_test.cpp b/be/test/runtime/array_test.cpp index 162157852d..9ef5277352 100644 --- a/be/test/runtime/array_test.cpp +++ b/be/test/runtime/array_test.cpp @@ -45,7 +45,10 @@ #include "testutil/desc_tbl_builder.h" #include "util/file_utils.h" #include "util/uid_util.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" #include "vec/core/block.h" +#include "vec/data_types/data_type_factory.hpp" namespace doris { @@ -289,10 +292,12 @@ private: auto col = block.column_block(0); int index = 0; size_t rows_read = 1024; + size_t num_rows = 0; do { ColumnBlockView dst(&col); st = iter->next_batch(&rows_read, &dst); EXPECT_TRUE(st.ok()); + num_rows += rows_read; for (int i = 0; i < rows_read; ++i) { validate(field, arrays[index++], reinterpret_cast<const CollectionValue*>(col.cell_ptr(i))); @@ -301,9 +306,44 @@ private: } while (rows_read >= 1024); auto type_info = get_type_info(column_pb); auto tuple_desc = get_tuple_descriptor(_object_pool, type_info.get()); - block.set_selected_size(rows_read); + block.set_selected_size(num_rows); test_convert_to_vec_block(block, tuple_desc, field, arrays); } + { + auto type_info = get_type_info(column_pb); + auto tuple_desc = get_tuple_descriptor(_object_pool, type_info.get()); + + auto reader = create_column_reader(path, meta, arrays.size()); + EXPECT_NE(reader, nullptr); + auto rblock = create_readable_block(path); + EXPECT_NE(rblock, nullptr); + OlapReaderStatistics stats; + std::unique_ptr<segment_v2::ColumnIterator> iter( + new_iterator(rblock.get(), &stats, reader.get())); + EXPECT_NE(iter, nullptr); + auto st = iter->seek_to_first(); + EXPECT_TRUE(st.ok()) << st.to_string(); + + auto data_type = + vectorized::DataTypeFactory::instance().create_data_type(tablet_column); + auto column_ptr = data_type->create_column(); + size_t rows_read = 1024; + column_ptr->reserve(rows_read); + do { + bool has_null = false; + st = iter->next_batch(&rows_read, column_ptr, &has_null); + EXPECT_TRUE(st.ok()); + vectorized::Block vblock; + vblock.insert({const_cast<const vectorized::IColumn&>(*column_ptr).get_ptr(), + data_type, ""}); + for (int i = 0; i < arrays.size(); ++i) { + auto tuple = vblock.deep_copy_tuple(*tuple_desc, _mem_pool.get(), i, 0, false); + auto actual = + tuple->get_collection_slot(tuple_desc->slots().front()->tuple_offset()); + validate(field, arrays[i], actual); + } + } while (rows_read >= 1024); + } } template <segment_v2::EncodingTypePB array_encoding, segment_v2::EncodingTypePB item_encoding> --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org