Re: [PR] Var nested [doris]

via GitHub Sat, 17 Aug 2024 20:25:55 -0700


xiaokang commented on code in PR #39022:
URL: https://github.com/apache/doris/pull/39022#discussion_r1718142693



##########
be/src/olap/rowset/segment_v2/segment.h:
##########
@@ -162,10 +162,10 @@ class Segment : public 
std::enable_shared_from_this<Segment> {
     // nullptr will returned if storage type does not contains such column
     std::shared_ptr<const vectorized::IDataType> 
get_data_type_of(vectorized::PathInDataPtr path,
                                                                   bool 
is_nullable,
-                                                                  bool 
ignore_children) const;

Review Comment:
   Is it safe to delete ignore_children?



##########
be/src/vec/data_types/data_type_jsonb.h:
##########
@@ -68,7 +68,7 @@ class DataTypeJsonb final : public IDataType {
     MutableColumnPtr create_column() const override;
 
     virtual Field get_default() const override {
-        std::string default_json = "{}";
+        std::string default_json = "null";

Review Comment:
   Why change default value to null ?



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -415,11 +456,77 @@ void ColumnObject::Subcolumn::insert(Field field, 
FieldInfo info) {
     data.back()->insert(field);
 }
 
-void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t 
start, size_t length) {
+static DataTypePtr create_array(TypeIndex type, size_t num_dimensions) {
+    DataTypePtr result_type = 
make_nullable(DataTypeFactory::instance().create_data_type(type));
+    for (size_t i = 0; i < num_dimensions; ++i) {
+        result_type = 
make_nullable(std::make_shared<DataTypeArray>(result_type));
+    }
+    return result_type;
+}
+
+Array create_empty_array_field(size_t num_dimensions) {
+    if (num_dimensions == 0) {
+        throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                               "Cannot create array field with 0 dimensions");
+    }
+
+    Array array;
+    Array* current_array = &array;
+    for (size_t i = 1; i < num_dimensions; ++i) {
+        current_array->push_back(Array());
+        current_array = &current_array->back().get<Array&>();
+    }
+
+    return array;
+}
+
+// Recreates column with default scalar values and keeps sizes of arrays.
+static ColumnPtr recreate_column_with_default_values(const ColumnPtr& column, 
TypeIndex scalar_type,
+                                                     size_t num_dimensions) {
+    const auto* column_array = 
check_and_get_column<ColumnArray>(remove_nullable(column).get());
+    if (column_array && num_dimensions) {
+        return make_nullable(ColumnArray::create(
+                
recreate_column_with_default_values(column_array->get_data_ptr(), scalar_type,
+                                                    num_dimensions - 1),
+                IColumn::mutate(column_array->get_offsets_ptr())));
+    }
+
+    return create_array(scalar_type, num_dimensions)
+            ->create_column()
+            ->clone_resized(column->size());
+}
+
+ColumnObject::Subcolumn ColumnObject::Subcolumn::recreate_with_default_values(

Review Comment:
   suggest a better name: clone_with_default_values



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -415,11 +456,77 @@ void ColumnObject::Subcolumn::insert(Field field, 
FieldInfo info) {
     data.back()->insert(field);
 }
 
-void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t 
start, size_t length) {
+static DataTypePtr create_array(TypeIndex type, size_t num_dimensions) {
+    DataTypePtr result_type = 
make_nullable(DataTypeFactory::instance().create_data_type(type));
+    for (size_t i = 0; i < num_dimensions; ++i) {
+        result_type = 
make_nullable(std::make_shared<DataTypeArray>(result_type));
+    }
+    return result_type;
+}
+
+Array create_empty_array_field(size_t num_dimensions) {
+    if (num_dimensions == 0) {
+        throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                               "Cannot create array field with 0 dimensions");
+    }
+
+    Array array;
+    Array* current_array = &array;
+    for (size_t i = 1; i < num_dimensions; ++i) {
+        current_array->push_back(Array());
+        current_array = &current_array->back().get<Array&>();
+    }
+
+    return array;
+}
+
+// Recreates column with default scalar values and keeps sizes of arrays.
+static ColumnPtr recreate_column_with_default_values(const ColumnPtr& column, 
TypeIndex scalar_type,
+                                                     size_t num_dimensions) {
+    const auto* column_array = 
check_and_get_column<ColumnArray>(remove_nullable(column).get());
+    if (column_array && num_dimensions) {

Review Comment:
   Do you assume column_array's dimension is equal to num_dimensions? If they 
are not the same, does the function return expected result?



##########
be/src/olap/rowset/segment_v2/column_reader.cpp:
##########
@@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const 
rowid_t* rowids, const si
     return Status::OK();
 }
 
+Status DefaultNestedColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst) {
+    bool has_null = false;
+    return next_batch(n, dst, &has_null);
+}
+
+static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst,
+                                      vectorized::MutableColumnPtr& 
sibling_column, size_t nrows) {
+    const auto* sibling_array = 
vectorized::check_and_get_column<vectorized::ColumnArray>(
+            remove_nullable(sibling_column->get_ptr()));
+    CHECK(sibling_array) << "Expected array column, but mmet " << 
sibling_column->get_name();
+    ;
+    const auto* dst_array = 
vectorized::check_and_get_column<vectorized::ColumnArray>(
+            remove_nullable(dst->get_ptr()));
+    if (!dst_array || !sibling_array) {
+        throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+                               "Expected array column, but met %s and %s", 
dst->get_name(),
+                               sibling_column->get_name());
+    }
+    auto new_nested =
+            
dst_array->get_data_ptr()->clone_resized(sibling_array->get_data_ptr()->size());
+    auto new_array = make_nullable(vectorized::ColumnArray::create(
+            new_nested->assume_mutable(), 
sibling_array->get_offsets_ptr()->assume_mutable()));
+    dst->insert_range_from(*new_array, 0, new_array->size());

Review Comment:
   Where is the logic for default value.



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator {
         }));
 
         // build variant as container
-        auto container = vectorized::ColumnObject::create(true, false);
-        auto& container_variant = 
assert_cast<vectorized::ColumnObject&>(*container);
+        auto container = ColumnObject::create(true, false);
+        auto& container_variant = assert_cast<ColumnObject&>(*container);
 
         // add root first
-        if (_path.get_parts().size() == 1) {
-            auto& root_var =
-                    _root_reader->column->is_nullable()
-                            ? assert_cast<vectorized::ColumnObject&>(
-                                      assert_cast<vectorized::ColumnNullable&>(
-                                              *_root_reader->column)
-                                              .get_nested_column())
-                            : 
assert_cast<vectorized::ColumnObject&>(*_root_reader->column);
+        if (_path.get_parts().size() == 1 && _root_reader) {
+            auto& root_var = _root_reader->column->is_nullable()
+                                     ? assert_cast<ColumnObject&>(
+                                               
assert_cast<ColumnNullable&>(*_root_reader->column)
+                                                       .get_nested_column())
+                                     : 
assert_cast<ColumnObject&>(*_root_reader->column);
             auto column = root_var.get_root();
             auto type = root_var.get_root_type();
             container_variant.add_sub_column({}, std::move(column), type);
         }
-
+        // parent path -> subcolumns
+        std::map<PathInData, PathsWithColumnAndType> nested_subcolumns;
+        PathsWithColumnAndType non_nested_subcolumns;
         RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
-            vectorized::MutableColumnPtr column = node.data.column->get_ptr();
-            bool add = container_variant.add_sub_column(
-                    node.path.copy_pop_nfront(_path.get_parts().size()), 
std::move(column),
-                    node.data.type);
-            if (!add) {
-                return Status::InternalError("Duplicated {}, type {}", 
node.path.get_path(),
-                                             node.data.type->get_name());
+            MutableColumnPtr column = node.data.column->get_ptr();
+            PathInData real_path = 
node.path.copy_pop_nfront(_path.get_parts().size());

Review Comment:
   Is real_path a relative path from _path? If yes, use a name relative_path.



##########
be/src/olap/rowset/segment_creator.cpp:
##########
@@ -99,9 +100,10 @@ Status 
SegmentFlusher::_parse_variant_columns(vectorized::Block& block) {
         return Status::OK();
     }
 
-    vectorized::schema_util::ParseContext ctx;
-    ctx.record_raw_json_column = 
_context.tablet_schema->has_row_store_for_all_columns();

Review Comment:
   Is record_raw_json_column not used any more?



##########
be/src/vec/columns/column_object.h:
##########
@@ -125,27 +131,33 @@ class ColumnObject final : public COWHelper<IColumn, 
ColumnObject> {
 
         void get(size_t n, Field& res) const;
 
-        /// Checks the consistency of column's parts stored in @data.
-        void checkTypes() const;
-
         /// Inserts a field, which scalars can be arbitrary, but number of
         /// dimensions should be consistent with current common type.
         /// throws InvalidArgument when meet conflict types
         void insert(Field field);
 
         void insert(Field field, FieldInfo info);
 
-        void insertDefault();
+        void insert_default();
+
+        void insert_many_defaults(size_t length);
 
-        void insertManyDefaults(size_t length);
+        void insert_range_from(const Subcolumn& src, size_t start, size_t 
length);
 
-        void insertRangeFrom(const Subcolumn& src, size_t start, size_t 
length);
+        /// Recreates subcolumn with default scalar values and keeps sizes of 
arrays.
+        /// Used to create columns of type Nested with consistent array sizes.
+        Subcolumn recreate_with_default_values(const FieldInfo& field_info) 
const;
 
         void pop_back(size_t n);
 
+        Subcolumn cut(size_t start, size_t length) const;

Review Comment:
   add comment



##########
be/src/vec/columns/column_object.h:
##########
@@ -94,6 +95,11 @@ class ColumnObject final : public COWHelper<IColumn, 
ColumnObject> {
     // Using jsonb type as most common type, since it's adopted all types of 
json
     using MostCommonType = DataTypeJsonb;
     constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB;
+    // Nullable(Array(Nullable(Object)))
+    const static DataTypePtr NESTED_TYPE;
+    // Finlize mode for subcolumns, write mode will deal with sparse columns, 
only affects in flush block to segments.

Review Comment:
   add more explaination for 'write mode will deal with sparse columns'



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const {
     auto& object = res.get<VariantMap&>();
 
     for (const auto& entry : subcolumns) {
-        auto it = object.try_emplace(entry->path.get_path()).first;
-        entry->data.get(n, it->second);
+        Field field;
+        entry->data.get(n, field);
+        // Notice: we treat null as empty field
+        if (field.get_type() != Field::Types::Null) {
+            object.try_emplace(entry->path.get_path(), field);
+        }
+    }
+    if (object.empty()) {
+        res = Null();
+    }
+}
+
+void ColumnObject::add_nested_subcolumn(const PathInData& key, const 
FieldInfo& field_info,
+                                        size_t new_size) {
+    if (!key.has_nested_part()) {
+        throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+                               "Cannot add Nested subcolumn, because path 
doesn't contain Nested");
+    }
+
+    bool inserted = false;
+    /// We find node that represents the same Nested type as @key.
+    const auto* nested_node = subcolumns.find_best_match(key);

Review Comment:
   find_best_match may find a prefix node. Why not use find? I wonder it may 
cause problem.



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const {
     auto& object = res.get<VariantMap&>();
 
     for (const auto& entry : subcolumns) {
-        auto it = object.try_emplace(entry->path.get_path()).first;
-        entry->data.get(n, it->second);
+        Field field;
+        entry->data.get(n, field);
+        // Notice: we treat null as empty field

Review Comment:
   add comment for why



##########
be/src/vec/common/schema_util.cpp:
##########
@@ -212,7 +220,6 @@ void get_column_by_type(const vectorized::DataTypePtr& 
data_type, const std::str
                            "", child, {});
         
column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 
0));
         column.add_sub_column(child);
-        column.set_default_value("[]");

Review Comment:
   Why delete it?



##########
be/src/vec/json/path_in_data.cpp:
##########
@@ -118,12 +125,13 @@ void PathInData::build_parts(const Parts& other_parts) {
 void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) {
     parts.clear();
     path = pb.path();
-    has_nested = pb.has_has_nested();
+    has_nested = false;

Review Comment:
   why change it?



##########
be/src/vec/functions/function_cast.h:
##########
@@ -810,6 +810,20 @@ struct ConvertImplGenericToJsonb {
     }
 };
 
+struct ConvertNothingToJsonb {
+    static Status execute(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                          const size_t result, size_t input_rows_count) {
+        const auto& col_with_type_and_name = 
block.get_by_position(arguments[0]);
+        const IColumn& col_from = *col_with_type_and_name.column;
+        size_t size = col_from.size();
+        auto col_to = col_from.clone_resized(size);

Review Comment:
   col_to's type is Nothing but should be JSONB



##########
be/src/vec/data_types/convert_field_to_type.cpp:
##########
@@ -98,6 +98,10 @@ class FieldVisitorToStringSimple : public 
StaticVisitor<String> {
         LOG(FATAL) << "not implemeted";
         __builtin_unreachable();
     }
+    [[noreturn]] String operator()(const VariantMap& x) const {
+        LOG(FATAL) << "not implemeted";

Review Comment:
   Why so many LOG FATAL?



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -1132,6 +1298,44 @@ rapidjson::Value* 
find_leaf_node_by_path(rapidjson::Value& json, const PathInDat
     return find_leaf_node_by_path(current, path, idx + 1);
 }
 
+// skip empty json:
+// 1. null value as empty json
+// 2. nested array with only nulls, eg. [null. null]

Review Comment:
   It maybe all null in array in original data.



##########
be/src/vec/columns/subcolumn_tree.h:
##########
@@ -232,6 +219,50 @@ class SubcolumnsTree {
         return nullptr;
     }
 
+    /// Find leaf by path.
+    const Node* find_leaf(const PathInData& path) const {
+        const auto* candidate = find_exact(path);
+        if (!candidate || !candidate->is_scalar()) {
+            return nullptr;
+        }
+        return candidate;
+    }
+
+    const Node* get_leaf_of_the_same_nested(const PathInData& path,
+                                            const NodePredicate& pred) const {
+        if (!path.has_nested_part()) {
+            return nullptr;
+        }
+
+        const auto* current_node = find_leaf(path);
+        const Node* leaf = nullptr;
+
+        while (current_node) {
+            /// Try to find the first Nested up to the current node.
+            const auto* node_nested = find_parent(current_node, [](const auto& 
candidate) -> bool {
+                return candidate.is_nested();
+            });
+
+            if (!node_nested) {
+                break;
+            }
+
+            /// Find the leaf with subcolumn that contains values
+            /// for the last rows.
+            /// If there are no leaves, skip current node and find
+            /// the next node up to the current.
+            leaf = SubcolumnsTree<NodeData>::find_leaf(node_nested, pred);
+
+            if (leaf) {
+                break;
+            }
+
+            current_node = node_nested->parent;

Review Comment:
   It will try other anstor nested parent, so it's not always 
`get_leaf_of_the_same_nested`.



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -384,9 +425,9 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo 
info) {
     }
     if (data.empty()) {
         add_new_column_part(create_array_of_type(base_type.idx, value_dim, 
is_nullable));
-    } else if (least_common_type.get_type_id() != base_type.idx && 
!base_type.is_nothing()) {
-        if (schema_util::is_conversion_required_between_integers(base_type.idx,
-                                                                 
least_common_type.get_type_id())) {
+    } else if (least_common_type.get_base_type_id() != base_type.idx && 
!base_type.is_nothing()) {

Review Comment:
   ?



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -1381,34 +1585,72 @@ Status ColumnObject::merge_sparse_to_root_column() {
     return Status::OK();
 }
 
-void ColumnObject::finalize_if_not() {
-    if (!is_finalized()) {
-        finalize();
-    }
-}
-
-void ColumnObject::finalize(bool ignore_sparse) {
+void ColumnObject::unnest(Subcolumns::NodePtr& entry, Subcolumns& subcolumns) 
const {
+    entry->data.finalize();
+    auto nested_column = 
entry->data.get_finalized_column_ptr()->assume_mutable();
+    auto* nested_column_nullable = 
assert_cast<ColumnNullable*>(nested_column.get());
+    auto* nested_column_array =
+            
assert_cast<ColumnArray*>(nested_column_nullable->get_nested_column_ptr().get());

Review Comment:
   nested_column_nullable->get_nested_column_ptr() is just nested_column



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator {
         }));
 
         // build variant as container
-        auto container = vectorized::ColumnObject::create(true, false);
-        auto& container_variant = 
assert_cast<vectorized::ColumnObject&>(*container);
+        auto container = ColumnObject::create(true, false);
+        auto& container_variant = assert_cast<ColumnObject&>(*container);
 
         // add root first
-        if (_path.get_parts().size() == 1) {
-            auto& root_var =
-                    _root_reader->column->is_nullable()
-                            ? assert_cast<vectorized::ColumnObject&>(
-                                      assert_cast<vectorized::ColumnNullable&>(
-                                              *_root_reader->column)
-                                              .get_nested_column())
-                            : 
assert_cast<vectorized::ColumnObject&>(*_root_reader->column);
+        if (_path.get_parts().size() == 1 && _root_reader) {
+            auto& root_var = _root_reader->column->is_nullable()
+                                     ? assert_cast<ColumnObject&>(
+                                               
assert_cast<ColumnNullable&>(*_root_reader->column)
+                                                       .get_nested_column())
+                                     : 
assert_cast<ColumnObject&>(*_root_reader->column);
             auto column = root_var.get_root();
             auto type = root_var.get_root_type();
             container_variant.add_sub_column({}, std::move(column), type);
         }
-
+        // parent path -> subcolumns
+        std::map<PathInData, PathsWithColumnAndType> nested_subcolumns;
+        PathsWithColumnAndType non_nested_subcolumns;
         RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
-            vectorized::MutableColumnPtr column = node.data.column->get_ptr();
-            bool add = container_variant.add_sub_column(
-                    node.path.copy_pop_nfront(_path.get_parts().size()), 
std::move(column),
-                    node.data.type);
-            if (!add) {
-                return Status::InternalError("Duplicated {}, type {}", 
node.path.get_path(),
-                                             node.data.type->get_name());
+            MutableColumnPtr column = node.data.column->get_ptr();
+            PathInData real_path = 
node.path.copy_pop_nfront(_path.get_parts().size());
+
+            if (node.path.has_nested_part()) {
+                
CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()),
+                         getTypeName(TypeIndex::Array));
+                PathInData parent_path = 
node.path.get_nested_prefix_path().copy_pop_nfront(
+                        _path.get_parts().size());
+                nested_subcolumns[parent_path].emplace_back(real_path, 
column->get_ptr(),
+                                                            node.data.type);
+            } else {
+                non_nested_subcolumns.emplace_back(real_path, 
column->get_ptr(), node.data.type);
             }
             return Status::OK();
         }));
 
+        for (auto& entry : non_nested_subcolumns) {
+            DCHECK(!entry.path.has_nested_part());
+            bool add = container_variant.add_sub_column(entry.path, 
entry.column->assume_mutable(),
+                                                        entry.type);
+            if (!add) {
+                return Status::InternalError("Duplicated {}, type {}", 
entry.path.get_path(),
+                                             entry.type->get_name());
+            }
+        }
+        for (auto& entry : nested_subcolumns) {
+            MutableColumnPtr nested_object = ColumnObject::create(true, false);
+            const auto* base_array =
+                    
check_and_get_column<ColumnArray>(remove_nullable(entry.second[0].column));
+            MutableColumnPtr offset = 
base_array->get_offsets_ptr()->assume_mutable();
+            auto* nested_object_ptr = 
assert_cast<ColumnObject*>(nested_object.get());
+            // flatten nested arrays
+            for (const auto& subcolumn : entry.second) {
+                const auto& column = subcolumn.column;
+                const auto& type = subcolumn.type;
+                if (!remove_nullable(column)->is_column_array()) {
+                    return Status::InvalidArgument(
+                            "Meet none array column when flatten nested array, 
path {}, type {}",
+                            subcolumn.path.get_path(), 
subcolumn.type->get_name());
+                }
+                const auto* target_array =
+                        
check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get());
+                if (!base_array->has_equal_offsets(*target_array)) {
+                    return Status::InvalidArgument(
+                            "Meet none equal offsets array when flatten nested 
array, path {}, "
+                            "type {}",
+                            subcolumn.path.get_path(), 
subcolumn.type->get_name());
+                }
+                MutableColumnPtr flattend_column = 
check_and_get_column<ColumnArray>(target_array)

Review Comment:
   target_array is already ColumnArray and check_and_get_column is not 
necessary.



##########
be/src/olap/rowset/segment_v2/stream_reader.h:
##########
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "olap/rowset/segment_v2/column_reader.h"
+
+namespace doris::segment_v2 {
+
+struct StreamReader {
+    vectorized::MutableColumnPtr column;
+    std::unique_ptr<ColumnIterator> iterator;
+    std::shared_ptr<const vectorized::IDataType> type;
+    bool inited = false;
+    size_t rows_read = 0;
+    StreamReader() = default;
+    StreamReader(vectorized::MutableColumnPtr&& col, 
std::unique_ptr<ColumnIterator>&& it,
+                 std::shared_ptr<const vectorized::IDataType> t)
+            : column(std::move(col)), iterator(std::move(it)), type(t) {}
+};
+
+// path -> StreamReader
+using SubstreamReaderTree = vectorized::SubcolumnsTree<StreamReader>;
+
+struct SubcolumnReader {
+    std::unique_ptr<ColumnReader> reader;
+    std::shared_ptr<const vectorized::IDataType> file_column_type;

Review Comment:
   What's the difference between reader.type and file_column_type?



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator {
         }));
 
         // build variant as container
-        auto container = vectorized::ColumnObject::create(true, false);
-        auto& container_variant = 
assert_cast<vectorized::ColumnObject&>(*container);
+        auto container = ColumnObject::create(true, false);
+        auto& container_variant = assert_cast<ColumnObject&>(*container);
 
         // add root first
-        if (_path.get_parts().size() == 1) {
-            auto& root_var =
-                    _root_reader->column->is_nullable()
-                            ? assert_cast<vectorized::ColumnObject&>(
-                                      assert_cast<vectorized::ColumnNullable&>(
-                                              *_root_reader->column)
-                                              .get_nested_column())
-                            : 
assert_cast<vectorized::ColumnObject&>(*_root_reader->column);
+        if (_path.get_parts().size() == 1 && _root_reader) {
+            auto& root_var = _root_reader->column->is_nullable()
+                                     ? assert_cast<ColumnObject&>(
+                                               
assert_cast<ColumnNullable&>(*_root_reader->column)
+                                                       .get_nested_column())
+                                     : 
assert_cast<ColumnObject&>(*_root_reader->column);
             auto column = root_var.get_root();
             auto type = root_var.get_root_type();
             container_variant.add_sub_column({}, std::move(column), type);
         }
-
+        // parent path -> subcolumns
+        std::map<PathInData, PathsWithColumnAndType> nested_subcolumns;
+        PathsWithColumnAndType non_nested_subcolumns;
         RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
-            vectorized::MutableColumnPtr column = node.data.column->get_ptr();
-            bool add = container_variant.add_sub_column(
-                    node.path.copy_pop_nfront(_path.get_parts().size()), 
std::move(column),
-                    node.data.type);
-            if (!add) {
-                return Status::InternalError("Duplicated {}, type {}", 
node.path.get_path(),
-                                             node.data.type->get_name());
+            MutableColumnPtr column = node.data.column->get_ptr();
+            PathInData real_path = 
node.path.copy_pop_nfront(_path.get_parts().size());
+
+            if (node.path.has_nested_part()) {
+                
CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()),
+                         getTypeName(TypeIndex::Array));
+                PathInData parent_path = 
node.path.get_nested_prefix_path().copy_pop_nfront(
+                        _path.get_parts().size());
+                nested_subcolumns[parent_path].emplace_back(real_path, 
column->get_ptr(),
+                                                            node.data.type);
+            } else {
+                non_nested_subcolumns.emplace_back(real_path, 
column->get_ptr(), node.data.type);
             }
             return Status::OK();
         }));
 
+        for (auto& entry : non_nested_subcolumns) {
+            DCHECK(!entry.path.has_nested_part());
+            bool add = container_variant.add_sub_column(entry.path, 
entry.column->assume_mutable(),
+                                                        entry.type);
+            if (!add) {
+                return Status::InternalError("Duplicated {}, type {}", 
entry.path.get_path(),
+                                             entry.type->get_name());
+            }
+        }
+        for (auto& entry : nested_subcolumns) {

Review Comment:
   Add comment to explain what's done in this loop. I think it combine 
subcolumns of type Array<XX> with the same offsets to a single variant column 
of type Array<Object>.



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator {
         }));
 
         // build variant as container
-        auto container = vectorized::ColumnObject::create(true, false);
-        auto& container_variant = 
assert_cast<vectorized::ColumnObject&>(*container);
+        auto container = ColumnObject::create(true, false);
+        auto& container_variant = assert_cast<ColumnObject&>(*container);
 
         // add root first
-        if (_path.get_parts().size() == 1) {
-            auto& root_var =
-                    _root_reader->column->is_nullable()
-                            ? assert_cast<vectorized::ColumnObject&>(
-                                      assert_cast<vectorized::ColumnNullable&>(
-                                              *_root_reader->column)
-                                              .get_nested_column())
-                            : 
assert_cast<vectorized::ColumnObject&>(*_root_reader->column);
+        if (_path.get_parts().size() == 1 && _root_reader) {
+            auto& root_var = _root_reader->column->is_nullable()
+                                     ? assert_cast<ColumnObject&>(
+                                               
assert_cast<ColumnNullable&>(*_root_reader->column)
+                                                       .get_nested_column())
+                                     : 
assert_cast<ColumnObject&>(*_root_reader->column);
             auto column = root_var.get_root();
             auto type = root_var.get_root_type();
             container_variant.add_sub_column({}, std::move(column), type);
         }
-
+        // parent path -> subcolumns
+        std::map<PathInData, PathsWithColumnAndType> nested_subcolumns;
+        PathsWithColumnAndType non_nested_subcolumns;
         RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) {
-            vectorized::MutableColumnPtr column = node.data.column->get_ptr();
-            bool add = container_variant.add_sub_column(
-                    node.path.copy_pop_nfront(_path.get_parts().size()), 
std::move(column),
-                    node.data.type);
-            if (!add) {
-                return Status::InternalError("Duplicated {}, type {}", 
node.path.get_path(),
-                                             node.data.type->get_name());
+            MutableColumnPtr column = node.data.column->get_ptr();
+            PathInData real_path = 
node.path.copy_pop_nfront(_path.get_parts().size());
+
+            if (node.path.has_nested_part()) {
+                
CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()),
+                         getTypeName(TypeIndex::Array));
+                PathInData parent_path = 
node.path.get_nested_prefix_path().copy_pop_nfront(
+                        _path.get_parts().size());
+                nested_subcolumns[parent_path].emplace_back(real_path, 
column->get_ptr(),
+                                                            node.data.type);
+            } else {
+                non_nested_subcolumns.emplace_back(real_path, 
column->get_ptr(), node.data.type);
             }
             return Status::OK();
         }));
 
+        for (auto& entry : non_nested_subcolumns) {
+            DCHECK(!entry.path.has_nested_part());
+            bool add = container_variant.add_sub_column(entry.path, 
entry.column->assume_mutable(),
+                                                        entry.type);
+            if (!add) {
+                return Status::InternalError("Duplicated {}, type {}", 
entry.path.get_path(),
+                                             entry.type->get_name());
+            }
+        }
+        for (auto& entry : nested_subcolumns) {
+            MutableColumnPtr nested_object = ColumnObject::create(true, false);
+            const auto* base_array =
+                    
check_and_get_column<ColumnArray>(remove_nullable(entry.second[0].column));
+            MutableColumnPtr offset = 
base_array->get_offsets_ptr()->assume_mutable();
+            auto* nested_object_ptr = 
assert_cast<ColumnObject*>(nested_object.get());
+            // flatten nested arrays
+            for (const auto& subcolumn : entry.second) {
+                const auto& column = subcolumn.column;
+                const auto& type = subcolumn.type;
+                if (!remove_nullable(column)->is_column_array()) {
+                    return Status::InvalidArgument(
+                            "Meet none array column when flatten nested array, 
path {}, type {}",
+                            subcolumn.path.get_path(), 
subcolumn.type->get_name());
+                }
+                const auto* target_array =
+                        
check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get());
+                if (!base_array->has_equal_offsets(*target_array)) {

Review Comment:
   It's expensive to check offsets at runtime. Maybe you should just do it in 
DEBUG version.



##########
be/src/olap/rowset/segment_v2/stream_reader.h:
##########
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "olap/rowset/segment_v2/column_reader.h"
+
+namespace doris::segment_v2 {
+
+struct StreamReader {
+    vectorized::MutableColumnPtr column;
+    std::unique_ptr<ColumnIterator> iterator;
+    std::shared_ptr<const vectorized::IDataType> type;
+    bool inited = false;
+    size_t rows_read = 0;
+    StreamReader() = default;
+    StreamReader(vectorized::MutableColumnPtr&& col, 
std::unique_ptr<ColumnIterator>&& it,
+                 std::shared_ptr<const vectorized::IDataType> t)
+            : column(std::move(col)), iterator(std::move(it)), type(t) {}
+};
+
+// path -> StreamReader
+using SubstreamReaderTree = vectorized::SubcolumnsTree<StreamReader>;
+
+struct SubcolumnReader {
+    std::unique_ptr<ColumnReader> reader;
+    std::shared_ptr<const vectorized::IDataType> file_column_type;

Review Comment:
   Can you put file_column_type inside StreamReader.



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.h:
##########
@@ -168,26 +220,34 @@ class HierarchicalDataReader : public ColumnIterator {
             return Status::OK();
         }));
         container->clear();
-        if (_root_reader->column->is_nullable()) {
-            // fill nullmap
-            DCHECK(dst->is_nullable());
-            vectorized::ColumnUInt8& dst_null_map =
-                    
assert_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column();
-            vectorized::ColumnUInt8& src_null_map =
-                    
assert_cast<vectorized::ColumnNullable&>(*_root_reader->column)
-                            .get_null_map_column();
-            dst_null_map.insert_range_from(src_null_map, 0, 
src_null_map.size());
-            // clear nullmap and inner data
-            src_null_map.clear();
-            assert_cast<vectorized::ColumnObject&>(
-                    
assert_cast<vectorized::ColumnNullable&>(*_root_reader->column)
-                            .get_nested_column())
-                    .clear_subcolumns_data();
+        if (_root_reader) {
+            if (_root_reader->column->is_nullable()) {
+                // fill nullmap
+                DCHECK(dst->is_nullable());
+                ColumnUInt8& dst_null_map =
+                        
assert_cast<ColumnNullable&>(*dst).get_null_map_column();
+                ColumnUInt8& src_null_map =
+                        
assert_cast<ColumnNullable&>(*_root_reader->column).get_null_map_column();
+                dst_null_map.insert_range_from(src_null_map, 0, 
src_null_map.size());
+                // clear nullmap and inner data
+                src_null_map.clear();
+                assert_cast<ColumnObject&>(
+                        
assert_cast<ColumnNullable&>(*_root_reader->column).get_nested_column())
+                        .clear_subcolumns_data();
+            } else {
+                ColumnObject& root_column = 
assert_cast<ColumnObject&>(*_root_reader->column);
+                root_column.clear_subcolumns_data();
+            }
         } else {
-            vectorized::ColumnObject& root_column =
-                    
assert_cast<vectorized::ColumnObject&>(*_root_reader->column);
-            root_column.clear_subcolumns_data();
+            if (dst->is_nullable()) {
+                // No nullable info exist in hirearchical data, fill nullmap 
with all none null
+                ColumnUInt8& dst_null_map =
+                        
assert_cast<ColumnNullable&>(*dst).get_null_map_column();
+                auto fake_nullable_column = ColumnUInt8::create(nrows, 0);

Review Comment:
   Do you mean all NULL, but 0 is for NOT NULL.



##########
be/src/olap/rowset/segment_v2/column_reader.cpp:
##########
@@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const 
rowid_t* rowids, const si
     return Status::OK();
 }
 
+Status DefaultNestedColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst) {
+    bool has_null = false;
+    return next_batch(n, dst, &has_null);
+}
+
+static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst,
+                                      vectorized::MutableColumnPtr& 
sibling_column, size_t nrows) {
+    const auto* sibling_array = 
vectorized::check_and_get_column<vectorized::ColumnArray>(
+            remove_nullable(sibling_column->get_ptr()));
+    CHECK(sibling_array) << "Expected array column, but mmet " << 
sibling_column->get_name();

Review Comment:
   It's duplicate with `throw Exception` bellow.



##########
be/src/olap/rowset/segment_v2/column_reader.cpp:
##########
@@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const 
rowid_t* rowids, const si
     return Status::OK();
 }
 
+Status DefaultNestedColumnIterator::next_batch(size_t* n, 
vectorized::MutableColumnPtr& dst) {
+    bool has_null = false;
+    return next_batch(n, dst, &has_null);
+}
+
+static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst,
+                                      vectorized::MutableColumnPtr& 
sibling_column, size_t nrows) {
+    const auto* sibling_array = 
vectorized::check_and_get_column<vectorized::ColumnArray>(
+            remove_nullable(sibling_column->get_ptr()));
+    CHECK(sibling_array) << "Expected array column, but mmet " << 
sibling_column->get_name();
+    ;

Review Comment:
   useless ;



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -896,18 +1062,28 @@ void ColumnObject::insert_range_from(const IColumn& src, 
size_t start, size_t le
     const auto& src_object = assert_cast<const ColumnObject&>(src);
     for (const auto& entry : src_object.subcolumns) {
         if (!has_subcolumn(entry->path)) {
-            add_sub_column(entry->path, num_rows);
+            if (entry->path.has_nested_part()) {
+                FieldInfo field_info {
+                        .scalar_type_id = 
entry->data.least_common_type.get_base_type_id(),
+                        .num_dimensions = entry->data.get_dimensions()};
+                add_nested_subcolumn(entry->path, field_info, num_rows);

Review Comment:
   ifelse branch for `add_nested_subcolumn` and 
`try_insert_many_defaults_from_nested` are repeated many times, can you combine 
`add_nested_subcolumn` and `add_sub_column`, 
`try_insert_many_defaults_from_nested` and `insert_many_defaults`.



##########
be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp:
##########
@@ -51,12 +52,15 @@ Status 
HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader,
     // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root 
path so that
     // we could make sure the data could be fully merged, since some column 
may not be extracted but remains in root
     // like {"a" : "b" : {"e" : 1.1}} in jsonb format
-    ColumnIterator* it;
-    RETURN_IF_ERROR(root->data.reader->new_iterator(&it));
-    stream_iter->set_root(std::make_unique<StreamReader>(
-            root->data.file_column_type->create_column(), 
std::unique_ptr<ColumnIterator>(it),
-            root->data.file_column_type));
+    if (read_type == ReadType::MERGE_SPARSE) {

Review Comment:
   Before this change, all type will run this branch.



##########
fe/fe-core/src/test/java/org/apache/doris/task/AgentTaskTest.java:
##########
@@ -108,7 +108,7 @@ public void setUp() throws AnalysisException {
         createReplicaTask = new CreateReplicaTask(backendId1, dbId, tableId, 
partitionId,
                 indexId1, tabletId1, replicaId1, shortKeyNum, schemaHash1, 
version, KeysType.AGG_KEYS, storageType,
                 TStorageMedium.SSD, columns, null, 0, latch, null, false, 
TTabletType.TABLET_TYPE_DISK, null,
-                TCompressionType.LZ4F, false, "", false, false, false, "", 0, 
0, 0, 0, 0, false, null, null, objectPool, rowStorePageSize);
+                TCompressionType.LZ4F, false, "", false, false, false, "", 0, 
0, 0, 0, 0, false, null, null, objectPool, rowStorePageSize, false);

Review Comment:
   It's better to test with true.



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const {
     auto& object = res.get<VariantMap&>();
 
     for (const auto& entry : subcolumns) {
-        auto it = object.try_emplace(entry->path.get_path()).first;
-        entry->data.get(n, it->second);
+        Field field;
+        entry->data.get(n, field);
+        // Notice: we treat null as empty field
+        if (field.get_type() != Field::Types::Null) {
+            object.try_emplace(entry->path.get_path(), field);
+        }
+    }
+    if (object.empty()) {
+        res = Null();
+    }
+}
+
+void ColumnObject::add_nested_subcolumn(const PathInData& key, const 
FieldInfo& field_info,
+                                        size_t new_size) {
+    if (!key.has_nested_part()) {
+        throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+                               "Cannot add Nested subcolumn, because path 
doesn't contain Nested");
+    }
+
+    bool inserted = false;
+    /// We find node that represents the same Nested type as @key.
+    const auto* nested_node = subcolumns.find_best_match(key);

Review Comment:
   add check for the last nested in path



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const {
     auto& object = res.get<VariantMap&>();
 
     for (const auto& entry : subcolumns) {
-        auto it = object.try_emplace(entry->path.get_path()).first;
-        entry->data.get(n, it->second);
+        Field field;
+        entry->data.get(n, field);
+        // Notice: we treat null as empty field
+        if (field.get_type() != Field::Types::Null) {
+            object.try_emplace(entry->path.get_path(), field);
+        }
+    }
+    if (object.empty()) {
+        res = Null();
+    }
+}
+
+void ColumnObject::add_nested_subcolumn(const PathInData& key, const 
FieldInfo& field_info,
+                                        size_t new_size) {
+    if (!key.has_nested_part()) {
+        throw doris::Exception(ErrorCode::INTERNAL_ERROR,
+                               "Cannot add Nested subcolumn, because path 
doesn't contain Nested");
+    }
+
+    bool inserted = false;
+    /// We find node that represents the same Nested type as @key.
+    const auto* nested_node = subcolumns.find_best_match(key);

Review Comment:
   eg. request key is a.b.c, but a.b is a nested node and a.b is returned, then 
all operation is performed on a.b instead of a.c.



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -599,6 +604,44 @@ Status Segment::new_column_iterator_with_path(const 
TabletColumn& tablet_column,
                type == ReaderType::READER_FULL_COMPACTION || type == 
ReaderType::READER_CHECKSUM;
     };
 
+    auto new_default_iter = [&]() {
+        if (tablet_column.is_nested_subcolumn() &&
+            type_to_read_flat_leaves(opt->io_ctx.reader_type)) {
+            // We find node that represents the same Nested type as path.
+            const auto* parent = 
_sub_column_tree.find_best_match(*tablet_column.path_info_ptr());
+            VLOG_DEBUG << "find with path " << 
tablet_column.path_info_ptr()->get_path()
+                       << " parent " << (parent ? parent->path.get_path() : 
"nullptr") << ", type "
+                       << ", parent is nested " << (parent ? 
parent->is_nested() : false) << ", "
+                       << 
TabletColumn::get_string_by_field_type(tablet_column.type());
+            // find it's common parent with nested part
+            // why not use parent->path->has_nested_part? because parent may 
not be a leaf node
+            // none leaf node may not contain path info
+            // Example:
+            // {"payload" : {"commits" : [{"issue" : {"id" : 123, "email" : 
"a@b"}}]}}
+            // nested node path          : payload.commits(NESTED)
+            // tablet_column path_info   : payload.commits.issue.id(SCALAR
+            // parent path node          : payload.commits.issue(TUPLE)
+            // leaf path_info            : payload.commits.issue.email(SCALAR)
+            if (parent && SubcolumnColumnReaders::find_parent(
+                                  parent, [](const auto& node) { return 
node.is_nested(); })) {
+                /// Find any leaf of Nested subcolumn.
+                const auto* leaf = SubcolumnColumnReaders::find_leaf(

Review Comment:
   Maybe you can encapsulate a function find_sibling_with_same_nested.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] Var nested [doris]

Reply via email to