This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 983cdc7b0d [feature-wip](array-type) Support loading data in vectorized format (#10065)
983cdc7b0d is described below

commit 983cdc7b0dec2a7a2838ee364633725cd2117a31
Author: Adonis Ling <adonis0...@gmail.com>
AuthorDate: Wed Jun 15 14:40:28 2022 +0800

    [feature-wip](array-type) Support loading data in vectorized format (#10065)
---
 be/src/olap/rowset/segment_v2/column_reader.cpp | 45 +++++++++++++++++++++++--
 be/src/olap/rowset/segment_v2/column_reader.h   |  2 ++
 be/test/runtime/array_test.cpp                  | 42 ++++++++++++++++++++++-
 3 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp
index 88eb1bfc3e..d68ffc66c0 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/column_reader.cpp
@@ -17,7 +17,6 @@
 
 #include "olap/rowset/segment_v2/column_reader.h"
 
-#include "common/logging.h"
 #include "gutil/strings/substitute.h"                // for Substitute
 #include "olap/column_block.h"                       // for ColumnBlockView
 #include "olap/rowset/segment_v2/binary_dict_page.h" // for 
BinaryDictPageDecoder
@@ -28,8 +27,9 @@
 #include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer
 #include "olap/types.h"                          // for TypeInfo
 #include "util/block_compression.h"
-#include "util/coding.h"       // for get_varint32
 #include "util/rle_encoding.h" // for RleDecoder
+#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
 #include "vec/core/types.h"
 #include "vec/runtime/vdatetime_value.h" //for VecDateTime
 
@@ -462,6 +462,47 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, 
ColumnBlockView* dst, bool
     return Status::OK();
 }
 
+// Reads a batch of array rows into the vectorized column `dst`.
+//
+// `dst` is expected to be a ColumnArray, or a nullable column whose nested column
+// is a ColumnArray. The steps are:
+//   1. `_length_iterator` appends values to the array's offsets column (`n` is
+//      passed straight through to it, so it may update `*n`).
+//   2. Each appended value is rewritten as "value + previous entry", i.e. a running sum.
+//   3. The number of elements implied by those offsets is read from
+//      `_item_iterator` into the array's data column.
+//   4. For a nullable `dst`, `*n` entries are read from `_null_iterator` into the
+//      null map column.
+//
+// Returns OK immediately (without reading items or null signs) when `*n` is 0
+// after step 1. Returns an error if `dst` is not an array column, if any nested
+// iterator fails, or if a nested iterator delivers fewer values than required.
+//
+// NOTE(review): `has_null` is never written by this overload - confirm callers do
+// not rely on it.
+Status ArrayFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst,
+                                           bool* has_null) {
+    const auto* column_array = vectorized::check_and_get_column<vectorized::ColumnArray>(
+            dst->is_nullable() ? static_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()
+                               : *dst);
+    // The checked cast hands back a pointer; fail cleanly rather than dereference
+    // a null one when the destination is not an array column.
+    if (column_array == nullptr) {
+        return Status::InternalError("ArrayFileColumnIterator: destination is not an array column");
+    }
+
+    bool offsets_has_null = false;
+    auto column_offsets_ptr = column_array->get_offsets_column().assume_mutable();
+    // Index of the first offset entry appended by this call.
+    const ssize_t start = static_cast<ssize_t>(column_offsets_ptr->size());
+    RETURN_IF_ERROR(_length_iterator->next_batch(n, column_offsets_ptr, &offsets_has_null));
+    if (*n == 0) {
+        return Status::OK();
+    }
+    auto& column_offsets =
+            static_cast<vectorized::ColumnArray::ColumnOffsets&>(*column_offsets_ptr);
+    auto& offsets_data = column_offsets.get_data();
+    // Accumulate the newly appended values onto the preceding entry. When
+    // start == 0 this reads index -1, which the original author marked as safe.
+    // NOTE(review): presumably the container is left-padded - confirm.
+    const ssize_t end = static_cast<ssize_t>(offsets_data.size());
+    for (ssize_t i = start; i < end; ++i) {
+        offsets_data[i] += offsets_data[i - 1];
+    }
+
+    auto column_items_ptr = column_array->get_data().assume_mutable();
+    // Elements belonging to the rows appended by this call.
+    const size_t num_items = offsets_data.back() - offsets_data[start - 1];
+    if (num_items > 0) {
+        size_t num_read = num_items;
+        bool items_has_null = false;
+        RETURN_IF_ERROR(_item_iterator->next_batch(&num_read, column_items_ptr, &items_has_null));
+        DCHECK(num_read == num_items);
+        // Explicit check in addition to the DCHECK, so a short read is reported
+        // even where the DCHECK is compiled out; otherwise the offsets would
+        // describe more elements than the data column holds.
+        if (num_read != num_items) {
+            return Status::InternalError(strings::Substitute(
+                    "ArrayFileColumnIterator: expected $0 array items but read $1", num_items,
+                    num_read));
+        }
+    }
+
+    if (dst->is_nullable()) {
+        auto null_map_ptr =
+                static_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column_ptr();
+        size_t num_read = *n;
+        bool null_signs_has_null = false;
+        RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null));
+        DCHECK(num_read == *n);
+        // Same reasoning as for the items: the null map must gain exactly one
+        // entry per row that was appended.
+        if (num_read != *n) {
+            return Status::InternalError(strings::Substitute(
+                    "ArrayFileColumnIterator: expected $0 null signs but read $1", *n, num_read));
+        }
+    }
+
+    return Status::OK();
+}
+
 
////////////////////////////////////////////////////////////////////////////////
 
 FileColumnIterator::FileColumnIterator(ColumnReader* reader) : _reader(reader) 
{}
diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h
index de51b01824..47250a96cb 100644
--- a/be/src/olap/rowset/segment_v2/column_reader.h
+++ b/be/src/olap/rowset/segment_v2/column_reader.h
@@ -335,6 +335,8 @@ public:
 
     Status next_batch(size_t* n, ColumnBlockView* dst, bool* has_null) 
override;
 
+    Status next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* 
has_null) override;
+
     Status seek_to_first() override {
         RETURN_IF_ERROR(_length_iterator->seek_to_first());
         RETURN_IF_ERROR(_item_iterator->seek_to_first()); // lazy???
diff --git a/be/test/runtime/array_test.cpp b/be/test/runtime/array_test.cpp
index 162157852d..9ef5277352 100644
--- a/be/test/runtime/array_test.cpp
+++ b/be/test/runtime/array_test.cpp
@@ -45,7 +45,10 @@
 #include "testutil/desc_tbl_builder.h"
 #include "util/file_utils.h"
 #include "util/uid_util.h"
+#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
 #include "vec/core/block.h"
+#include "vec/data_types/data_type_factory.hpp"
 
 namespace doris {
 
@@ -289,10 +292,12 @@ private:
             auto col = block.column_block(0);
             int index = 0;
             size_t rows_read = 1024;
+            size_t num_rows = 0;
             do {
                 ColumnBlockView dst(&col);
                 st = iter->next_batch(&rows_read, &dst);
                 EXPECT_TRUE(st.ok());
+                num_rows += rows_read;
                 for (int i = 0; i < rows_read; ++i) {
                     validate(field, arrays[index++],
                              reinterpret_cast<const 
CollectionValue*>(col.cell_ptr(i)));
@@ -301,9 +306,44 @@ private:
             } while (rows_read >= 1024);
             auto type_info = get_type_info(column_pb);
             auto tuple_desc = get_tuple_descriptor(_object_pool, 
type_info.get());
-            block.set_selected_size(rows_read);
+            block.set_selected_size(num_rows);
             test_convert_to_vec_block(block, tuple_desc, field, arrays);
         }
+        {
+            auto type_info = get_type_info(column_pb);
+            auto tuple_desc = get_tuple_descriptor(_object_pool, 
type_info.get());
+
+            auto reader = create_column_reader(path, meta, arrays.size());
+            EXPECT_NE(reader, nullptr);
+            auto rblock = create_readable_block(path);
+            EXPECT_NE(rblock, nullptr);
+            OlapReaderStatistics stats;
+            std::unique_ptr<segment_v2::ColumnIterator> iter(
+                    new_iterator(rblock.get(), &stats, reader.get()));
+            EXPECT_NE(iter, nullptr);
+            auto st = iter->seek_to_first();
+            EXPECT_TRUE(st.ok()) << st.to_string();
+
+            auto data_type =
+                    
vectorized::DataTypeFactory::instance().create_data_type(tablet_column);
+            auto column_ptr = data_type->create_column();
+            size_t rows_read = 1024;
+            column_ptr->reserve(rows_read);
+            do {
+                bool has_null = false;
+                st = iter->next_batch(&rows_read, column_ptr, &has_null);
+                EXPECT_TRUE(st.ok());
+                vectorized::Block vblock;
+                vblock.insert({const_cast<const 
vectorized::IColumn&>(*column_ptr).get_ptr(),
+                               data_type, ""});
+                for (int i = 0; i < arrays.size(); ++i) {
+                    auto tuple = vblock.deep_copy_tuple(*tuple_desc, 
_mem_pool.get(), i, 0, false);
+                    auto actual =
+                            
tuple->get_collection_slot(tuple_desc->slots().front()->tuple_offset());
+                    validate(field, arrays[i], actual);
+                }
+            } while (rows_read >= 1024);
+        }
     }
 
     template <segment_v2::EncodingTypePB array_encoding, 
segment_v2::EncodingTypePB item_encoding>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to