xiaokang commented on code in PR #39022: URL: https://github.com/apache/doris/pull/39022#discussion_r1718142693
########## be/src/olap/rowset/segment_v2/segment.h: ########## @@ -162,10 +162,10 @@ class Segment : public std::enable_shared_from_this<Segment> { // nullptr will returned if storage type does not contains such column std::shared_ptr<const vectorized::IDataType> get_data_type_of(vectorized::PathInDataPtr path, bool is_nullable, - bool ignore_children) const; Review Comment: Is it safe to delete ignore_children? ########## be/src/vec/data_types/data_type_jsonb.h: ########## @@ -68,7 +68,7 @@ class DataTypeJsonb final : public IDataType { MutableColumnPtr create_column() const override; virtual Field get_default() const override { - std::string default_json = "{}"; + std::string default_json = "null"; Review Comment: Why change default value to null ? ########## be/src/vec/columns/column_object.cpp: ########## @@ -415,11 +456,77 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { data.back()->insert(field); } -void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t start, size_t length) { +static DataTypePtr create_array(TypeIndex type, size_t num_dimensions) { + DataTypePtr result_type = make_nullable(DataTypeFactory::instance().create_data_type(type)); + for (size_t i = 0; i < num_dimensions; ++i) { + result_type = make_nullable(std::make_shared<DataTypeArray>(result_type)); + } + return result_type; +} + +Array create_empty_array_field(size_t num_dimensions) { + if (num_dimensions == 0) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Cannot create array field with 0 dimensions"); + } + + Array array; + Array* current_array = &array; + for (size_t i = 1; i < num_dimensions; ++i) { + current_array->push_back(Array()); + current_array = ¤t_array->back().get<Array&>(); + } + + return array; +} + +// Recreates column with default scalar values and keeps sizes of arrays. +static ColumnPtr recreate_column_with_default_values(const ColumnPtr& column, TypeIndex scalar_type, + size_t num_dimensions) { + const auto* column_array = check_and_get_column<ColumnArray>(remove_nullable(column).get()); + if (column_array && num_dimensions) { + return make_nullable(ColumnArray::create( + recreate_column_with_default_values(column_array->get_data_ptr(), scalar_type, + num_dimensions - 1), + IColumn::mutate(column_array->get_offsets_ptr()))); + } + + return create_array(scalar_type, num_dimensions) + ->create_column() + ->clone_resized(column->size()); +} + +ColumnObject::Subcolumn ColumnObject::Subcolumn::recreate_with_default_values( Review Comment: suggest a better name: clone_with_default_values ########## be/src/vec/columns/column_object.cpp: ########## @@ -415,11 +456,77 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { data.back()->insert(field); } -void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn& src, size_t start, size_t length) { +static DataTypePtr create_array(TypeIndex type, size_t num_dimensions) { + DataTypePtr result_type = make_nullable(DataTypeFactory::instance().create_data_type(type)); + for (size_t i = 0; i < num_dimensions; ++i) { + result_type = make_nullable(std::make_shared<DataTypeArray>(result_type)); + } + return result_type; +} + +Array create_empty_array_field(size_t num_dimensions) { + if (num_dimensions == 0) { + throw doris::Exception(ErrorCode::INVALID_ARGUMENT, + "Cannot create array field with 0 dimensions"); + } + + Array array; + Array* current_array = &array; + for (size_t i = 1; i < num_dimensions; ++i) { + current_array->push_back(Array()); + current_array = ¤t_array->back().get<Array&>(); + } + + return array; +} + +// Recreates column with default scalar values and keeps sizes of arrays. +static ColumnPtr recreate_column_with_default_values(const ColumnPtr& column, TypeIndex scalar_type, + size_t num_dimensions) { + const auto* column_array = check_and_get_column<ColumnArray>(remove_nullable(column).get()); + if (column_array && num_dimensions) { Review Comment: Do you assume column_array's dimension is equal to num_dimensions? If they are not the same, does the function return expected result? ########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const si return Status::OK(); } +Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst) { + bool has_null = false; + return next_batch(n, dst, &has_null); +} + +static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst, + vectorized::MutableColumnPtr& sibling_column, size_t nrows) { + const auto* sibling_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + remove_nullable(sibling_column->get_ptr())); + CHECK(sibling_array) << "Expected array column, but mmet " << sibling_column->get_name(); + ; + const auto* dst_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + remove_nullable(dst->get_ptr())); + if (!dst_array || !sibling_array) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Expected array column, but met %s and %s", dst->get_name(), + sibling_column->get_name()); + } + auto new_nested = + dst_array->get_data_ptr()->clone_resized(sibling_array->get_data_ptr()->size()); + auto new_array = make_nullable(vectorized::ColumnArray::create( + new_nested->assume_mutable(), sibling_array->get_offsets_ptr()->assume_mutable())); + dst->insert_range_from(*new_array, 0, new_array->size()); Review Comment: Where is the logic for default value. ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.h: ########## @@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator { })); // build variant as container - auto container = vectorized::ColumnObject::create(true, false); - auto& container_variant = assert_cast<vectorized::ColumnObject&>(*container); + auto container = ColumnObject::create(true, false); + auto& container_variant = assert_cast<ColumnObject&>(*container); // add root first - if (_path.get_parts().size() == 1) { - auto& root_var = - _root_reader->column->is_nullable() - ? assert_cast<vectorized::ColumnObject&>( - assert_cast<vectorized::ColumnNullable&>( - *_root_reader->column) - .get_nested_column()) - : assert_cast<vectorized::ColumnObject&>(*_root_reader->column); + if (_path.get_parts().size() == 1 && _root_reader) { + auto& root_var = _root_reader->column->is_nullable() + ? assert_cast<ColumnObject&>( + assert_cast<ColumnNullable&>(*_root_reader->column) + .get_nested_column()) + : assert_cast<ColumnObject&>(*_root_reader->column); auto column = root_var.get_root(); auto type = root_var.get_root_type(); container_variant.add_sub_column({}, std::move(column), type); } - + // parent path -> subcolumns + std::map<PathInData, PathsWithColumnAndType> nested_subcolumns; + PathsWithColumnAndType non_nested_subcolumns; RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { - vectorized::MutableColumnPtr column = node.data.column->get_ptr(); - bool add = container_variant.add_sub_column( - node.path.copy_pop_nfront(_path.get_parts().size()), std::move(column), - node.data.type); - if (!add) { - return Status::InternalError("Duplicated {}, type {}", node.path.get_path(), - node.data.type->get_name()); + MutableColumnPtr column = node.data.column->get_ptr(); + PathInData real_path = node.path.copy_pop_nfront(_path.get_parts().size()); Review Comment: Is real_path a relative path from _path? If yes, use a name relative_path. ########## be/src/olap/rowset/segment_creator.cpp: ########## @@ -99,9 +100,10 @@ Status SegmentFlusher::_parse_variant_columns(vectorized::Block& block) { return Status::OK(); } - vectorized::schema_util::ParseContext ctx; - ctx.record_raw_json_column = _context.tablet_schema->has_row_store_for_all_columns(); Review Comment: Is record_raw_json_column not used any more? ########## be/src/vec/columns/column_object.h: ########## @@ -125,27 +131,33 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { void get(size_t n, Field& res) const; - /// Checks the consistency of column's parts stored in @data. - void checkTypes() const; - /// Inserts a field, which scalars can be arbitrary, but number of /// dimensions should be consistent with current common type. /// throws InvalidArgument when meet conflict types void insert(Field field); void insert(Field field, FieldInfo info); - void insertDefault(); + void insert_default(); + + void insert_many_defaults(size_t length); - void insertManyDefaults(size_t length); + void insert_range_from(const Subcolumn& src, size_t start, size_t length); - void insertRangeFrom(const Subcolumn& src, size_t start, size_t length); + /// Recreates subcolumn with default scalar values and keeps sizes of arrays. + /// Used to create columns of type Nested with consistent array sizes. + Subcolumn recreate_with_default_values(const FieldInfo& field_info) const; void pop_back(size_t n); + Subcolumn cut(size_t start, size_t length) const; Review Comment: add comment ########## be/src/vec/columns/column_object.h: ########## @@ -94,6 +95,11 @@ class ColumnObject final : public COWHelper<IColumn, ColumnObject> { // Using jsonb type as most common type, since it's adopted all types of json using MostCommonType = DataTypeJsonb; constexpr static TypeIndex MOST_COMMON_TYPE_ID = TypeIndex::JSONB; + // Nullable(Array(Nullable(Object))) + const static DataTypePtr NESTED_TYPE; + // Finlize mode for subcolumns, write mode will deal with sparse columns, only affects in flush block to segments. Review Comment: add more explaination for 'write mode will deal with sparse columns' ########## be/src/vec/columns/column_object.cpp: ########## @@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const { auto& object = res.get<VariantMap&>(); for (const auto& entry : subcolumns) { - auto it = object.try_emplace(entry->path.get_path()).first; - entry->data.get(n, it->second); + Field field; + entry->data.get(n, field); + // Notice: we treat null as empty field + if (field.get_type() != Field::Types::Null) { + object.try_emplace(entry->path.get_path(), field); + } + } + if (object.empty()) { + res = Null(); + } +} + +void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo& field_info, + size_t new_size) { + if (!key.has_nested_part()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Cannot add Nested subcolumn, because path doesn't contain Nested"); + } + + bool inserted = false; + /// We find node that represents the same Nested type as @key. + const auto* nested_node = subcolumns.find_best_match(key); Review Comment: find_best_match may find a prefix node. Why not use find? I wonder it may cause problem. ########## be/src/vec/columns/column_object.cpp: ########## @@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const { auto& object = res.get<VariantMap&>(); for (const auto& entry : subcolumns) { - auto it = object.try_emplace(entry->path.get_path()).first; - entry->data.get(n, it->second); + Field field; + entry->data.get(n, field); + // Notice: we treat null as empty field Review Comment: add comment for why ########## be/src/vec/common/schema_util.cpp: ########## @@ -212,7 +220,6 @@ void get_column_by_type(const vectorized::DataTypePtr& data_type, const std::str "", child, {}); column.set_length(TabletColumn::get_field_length_by_type(TPrimitiveType::ARRAY, 0)); column.add_sub_column(child); - column.set_default_value("[]"); Review Comment: Why delete it? ########## be/src/vec/json/path_in_data.cpp: ########## @@ -118,12 +125,13 @@ void PathInData::build_parts(const Parts& other_parts) { void PathInData::from_protobuf(const segment_v2::ColumnPathInfo& pb) { parts.clear(); path = pb.path(); - has_nested = pb.has_has_nested(); + has_nested = false; Review Comment: why change it? ########## be/src/vec/functions/function_cast.h: ########## @@ -810,6 +810,20 @@ struct ConvertImplGenericToJsonb { } }; +struct ConvertNothingToJsonb { + static Status execute(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + const size_t result, size_t input_rows_count) { + const auto& col_with_type_and_name = block.get_by_position(arguments[0]); + const IColumn& col_from = *col_with_type_and_name.column; + size_t size = col_from.size(); + auto col_to = col_from.clone_resized(size); Review Comment: col_to's type is Nothing but should be JSONB ########## be/src/vec/data_types/convert_field_to_type.cpp: ########## @@ -98,6 +98,10 @@ class FieldVisitorToStringSimple : public StaticVisitor<String> { LOG(FATAL) << "not implemeted"; __builtin_unreachable(); } + [[noreturn]] String operator()(const VariantMap& x) const { + LOG(FATAL) << "not implemeted"; Review Comment: Why so many LOG FATAL? ########## be/src/vec/columns/column_object.cpp: ########## @@ -1132,6 +1298,44 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat return find_leaf_node_by_path(current, path, idx + 1); } +// skip empty json: +// 1. null value as empty json +// 2. nested array with only nulls, eg. [null. null] Review Comment: It maybe all null in array in original data. ########## be/src/vec/columns/subcolumn_tree.h: ########## @@ -232,6 +219,50 @@ class SubcolumnsTree { return nullptr; } + /// Find leaf by path. + const Node* find_leaf(const PathInData& path) const { + const auto* candidate = find_exact(path); + if (!candidate || !candidate->is_scalar()) { + return nullptr; + } + return candidate; + } + + const Node* get_leaf_of_the_same_nested(const PathInData& path, + const NodePredicate& pred) const { + if (!path.has_nested_part()) { + return nullptr; + } + + const auto* current_node = find_leaf(path); + const Node* leaf = nullptr; + + while (current_node) { + /// Try to find the first Nested up to the current node. + const auto* node_nested = find_parent(current_node, [](const auto& candidate) -> bool { + return candidate.is_nested(); + }); + + if (!node_nested) { + break; + } + + /// Find the leaf with subcolumn that contains values + /// for the last rows. + /// If there are no leaves, skip current node and find + /// the next node up to the current. + leaf = SubcolumnsTree<NodeData>::find_leaf(node_nested, pred); + + if (leaf) { + break; + } + + current_node = node_nested->parent; Review Comment: It will try other anstor nested parent, so it's not always `get_leaf_of_the_same_nested`. ########## be/src/vec/columns/column_object.cpp: ########## @@ -384,9 +425,9 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { } if (data.empty()) { add_new_column_part(create_array_of_type(base_type.idx, value_dim, is_nullable)); - } else if (least_common_type.get_type_id() != base_type.idx && !base_type.is_nothing()) { - if (schema_util::is_conversion_required_between_integers(base_type.idx, - least_common_type.get_type_id())) { + } else if (least_common_type.get_base_type_id() != base_type.idx && !base_type.is_nothing()) { Review Comment: ? ########## be/src/vec/columns/column_object.cpp: ########## @@ -1381,34 +1585,72 @@ Status ColumnObject::merge_sparse_to_root_column() { return Status::OK(); } -void ColumnObject::finalize_if_not() { - if (!is_finalized()) { - finalize(); - } -} - -void ColumnObject::finalize(bool ignore_sparse) { +void ColumnObject::unnest(Subcolumns::NodePtr& entry, Subcolumns& subcolumns) const { + entry->data.finalize(); + auto nested_column = entry->data.get_finalized_column_ptr()->assume_mutable(); + auto* nested_column_nullable = assert_cast<ColumnNullable*>(nested_column.get()); + auto* nested_column_array = + assert_cast<ColumnArray*>(nested_column_nullable->get_nested_column_ptr().get()); Review Comment: nested_column_nullable->get_nested_column_ptr() is just nested_column ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.h: ########## @@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator { })); // build variant as container - auto container = vectorized::ColumnObject::create(true, false); - auto& container_variant = assert_cast<vectorized::ColumnObject&>(*container); + auto container = ColumnObject::create(true, false); + auto& container_variant = assert_cast<ColumnObject&>(*container); // add root first - if (_path.get_parts().size() == 1) { - auto& root_var = - _root_reader->column->is_nullable() - ? assert_cast<vectorized::ColumnObject&>( - assert_cast<vectorized::ColumnNullable&>( - *_root_reader->column) - .get_nested_column()) - : assert_cast<vectorized::ColumnObject&>(*_root_reader->column); + if (_path.get_parts().size() == 1 && _root_reader) { + auto& root_var = _root_reader->column->is_nullable() + ? assert_cast<ColumnObject&>( + assert_cast<ColumnNullable&>(*_root_reader->column) + .get_nested_column()) + : assert_cast<ColumnObject&>(*_root_reader->column); auto column = root_var.get_root(); auto type = root_var.get_root_type(); container_variant.add_sub_column({}, std::move(column), type); } - + // parent path -> subcolumns + std::map<PathInData, PathsWithColumnAndType> nested_subcolumns; + PathsWithColumnAndType non_nested_subcolumns; RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { - vectorized::MutableColumnPtr column = node.data.column->get_ptr(); - bool add = container_variant.add_sub_column( - node.path.copy_pop_nfront(_path.get_parts().size()), std::move(column), - node.data.type); - if (!add) { - return Status::InternalError("Duplicated {}, type {}", node.path.get_path(), - node.data.type->get_name()); + MutableColumnPtr column = node.data.column->get_ptr(); + PathInData real_path = node.path.copy_pop_nfront(_path.get_parts().size()); + + if (node.path.has_nested_part()) { + CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()), + getTypeName(TypeIndex::Array)); + PathInData parent_path = node.path.get_nested_prefix_path().copy_pop_nfront( + _path.get_parts().size()); + nested_subcolumns[parent_path].emplace_back(real_path, column->get_ptr(), + node.data.type); + } else { + non_nested_subcolumns.emplace_back(real_path, column->get_ptr(), node.data.type); } return Status::OK(); })); + for (auto& entry : non_nested_subcolumns) { + DCHECK(!entry.path.has_nested_part()); + bool add = container_variant.add_sub_column(entry.path, entry.column->assume_mutable(), + entry.type); + if (!add) { + return Status::InternalError("Duplicated {}, type {}", entry.path.get_path(), + entry.type->get_name()); + } + } + for (auto& entry : nested_subcolumns) { + MutableColumnPtr nested_object = ColumnObject::create(true, false); + const auto* base_array = + check_and_get_column<ColumnArray>(remove_nullable(entry.second[0].column)); + MutableColumnPtr offset = base_array->get_offsets_ptr()->assume_mutable(); + auto* nested_object_ptr = assert_cast<ColumnObject*>(nested_object.get()); + // flatten nested arrays + for (const auto& subcolumn : entry.second) { + const auto& column = subcolumn.column; + const auto& type = subcolumn.type; + if (!remove_nullable(column)->is_column_array()) { + return Status::InvalidArgument( + "Meet none array column when flatten nested array, path {}, type {}", + subcolumn.path.get_path(), subcolumn.type->get_name()); + } + const auto* target_array = + check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get()); + if (!base_array->has_equal_offsets(*target_array)) { + return Status::InvalidArgument( + "Meet none equal offsets array when flatten nested array, path {}, " + "type {}", + subcolumn.path.get_path(), subcolumn.type->get_name()); + } + MutableColumnPtr flattend_column = check_and_get_column<ColumnArray>(target_array) Review Comment: target_array is already ColumnArray and check_and_get_column is not necessary. ########## be/src/olap/rowset/segment_v2/stream_reader.h: ########## @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> + +#include "olap/rowset/segment_v2/column_reader.h" + +namespace doris::segment_v2 { + +struct StreamReader { + vectorized::MutableColumnPtr column; + std::unique_ptr<ColumnIterator> iterator; + std::shared_ptr<const vectorized::IDataType> type; + bool inited = false; + size_t rows_read = 0; + StreamReader() = default; + StreamReader(vectorized::MutableColumnPtr&& col, std::unique_ptr<ColumnIterator>&& it, + std::shared_ptr<const vectorized::IDataType> t) + : column(std::move(col)), iterator(std::move(it)), type(t) {} +}; + +// path -> StreamReader +using SubstreamReaderTree = vectorized::SubcolumnsTree<StreamReader>; + +struct SubcolumnReader { + std::unique_ptr<ColumnReader> reader; + std::shared_ptr<const vectorized::IDataType> file_column_type; Review Comment: What's the difference between reader.type and file_column_type? ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.h: ########## @@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator { })); // build variant as container - auto container = vectorized::ColumnObject::create(true, false); - auto& container_variant = assert_cast<vectorized::ColumnObject&>(*container); + auto container = ColumnObject::create(true, false); + auto& container_variant = assert_cast<ColumnObject&>(*container); // add root first - if (_path.get_parts().size() == 1) { - auto& root_var = - _root_reader->column->is_nullable() - ? assert_cast<vectorized::ColumnObject&>( - assert_cast<vectorized::ColumnNullable&>( - *_root_reader->column) - .get_nested_column()) - : assert_cast<vectorized::ColumnObject&>(*_root_reader->column); + if (_path.get_parts().size() == 1 && _root_reader) { + auto& root_var = _root_reader->column->is_nullable() + ? assert_cast<ColumnObject&>( + assert_cast<ColumnNullable&>(*_root_reader->column) + .get_nested_column()) + : assert_cast<ColumnObject&>(*_root_reader->column); auto column = root_var.get_root(); auto type = root_var.get_root_type(); container_variant.add_sub_column({}, std::move(column), type); } - + // parent path -> subcolumns + std::map<PathInData, PathsWithColumnAndType> nested_subcolumns; + PathsWithColumnAndType non_nested_subcolumns; RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { - vectorized::MutableColumnPtr column = node.data.column->get_ptr(); - bool add = container_variant.add_sub_column( - node.path.copy_pop_nfront(_path.get_parts().size()), std::move(column), - node.data.type); - if (!add) { - return Status::InternalError("Duplicated {}, type {}", node.path.get_path(), - node.data.type->get_name()); + MutableColumnPtr column = node.data.column->get_ptr(); + PathInData real_path = node.path.copy_pop_nfront(_path.get_parts().size()); + + if (node.path.has_nested_part()) { + CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()), + getTypeName(TypeIndex::Array)); + PathInData parent_path = node.path.get_nested_prefix_path().copy_pop_nfront( + _path.get_parts().size()); + nested_subcolumns[parent_path].emplace_back(real_path, column->get_ptr(), + node.data.type); + } else { + non_nested_subcolumns.emplace_back(real_path, column->get_ptr(), node.data.type); } return Status::OK(); })); + for (auto& entry : non_nested_subcolumns) { + DCHECK(!entry.path.has_nested_part()); + bool add = container_variant.add_sub_column(entry.path, entry.column->assume_mutable(), + entry.type); + if (!add) { + return Status::InternalError("Duplicated {}, type {}", entry.path.get_path(), + entry.type->get_name()); + } + } + for (auto& entry : nested_subcolumns) { Review Comment: Add comment to explain what's done in this loop. I think it combine subcolumns of type Array<XX> with the same offsets to a single variant column of type Array<Object>. ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.h: ########## @@ -123,35 +113,97 @@ class HierarchicalDataReader : public ColumnIterator { })); // build variant as container - auto container = vectorized::ColumnObject::create(true, false); - auto& container_variant = assert_cast<vectorized::ColumnObject&>(*container); + auto container = ColumnObject::create(true, false); + auto& container_variant = assert_cast<ColumnObject&>(*container); // add root first - if (_path.get_parts().size() == 1) { - auto& root_var = - _root_reader->column->is_nullable() - ? assert_cast<vectorized::ColumnObject&>( - assert_cast<vectorized::ColumnNullable&>( - *_root_reader->column) - .get_nested_column()) - : assert_cast<vectorized::ColumnObject&>(*_root_reader->column); + if (_path.get_parts().size() == 1 && _root_reader) { + auto& root_var = _root_reader->column->is_nullable() + ? assert_cast<ColumnObject&>( + assert_cast<ColumnNullable&>(*_root_reader->column) + .get_nested_column()) + : assert_cast<ColumnObject&>(*_root_reader->column); auto column = root_var.get_root(); auto type = root_var.get_root_type(); container_variant.add_sub_column({}, std::move(column), type); } - + // parent path -> subcolumns + std::map<PathInData, PathsWithColumnAndType> nested_subcolumns; + PathsWithColumnAndType non_nested_subcolumns; RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { - vectorized::MutableColumnPtr column = node.data.column->get_ptr(); - bool add = container_variant.add_sub_column( - node.path.copy_pop_nfront(_path.get_parts().size()), std::move(column), - node.data.type); - if (!add) { - return Status::InternalError("Duplicated {}, type {}", node.path.get_path(), - node.data.type->get_name()); + MutableColumnPtr column = node.data.column->get_ptr(); + PathInData real_path = node.path.copy_pop_nfront(_path.get_parts().size()); + + if (node.path.has_nested_part()) { + CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()), + getTypeName(TypeIndex::Array)); + PathInData parent_path = node.path.get_nested_prefix_path().copy_pop_nfront( + _path.get_parts().size()); + nested_subcolumns[parent_path].emplace_back(real_path, column->get_ptr(), + node.data.type); + } else { + non_nested_subcolumns.emplace_back(real_path, column->get_ptr(), node.data.type); } return Status::OK(); })); + for (auto& entry : non_nested_subcolumns) { + DCHECK(!entry.path.has_nested_part()); + bool add = container_variant.add_sub_column(entry.path, entry.column->assume_mutable(), + entry.type); + if (!add) { + return Status::InternalError("Duplicated {}, type {}", entry.path.get_path(), + entry.type->get_name()); + } + } + for (auto& entry : nested_subcolumns) { + MutableColumnPtr nested_object = ColumnObject::create(true, false); + const auto* base_array = + check_and_get_column<ColumnArray>(remove_nullable(entry.second[0].column)); + MutableColumnPtr offset = base_array->get_offsets_ptr()->assume_mutable(); + auto* nested_object_ptr = assert_cast<ColumnObject*>(nested_object.get()); + // flatten nested arrays + for (const auto& subcolumn : entry.second) { + const auto& column = subcolumn.column; + const auto& type = subcolumn.type; + if (!remove_nullable(column)->is_column_array()) { + return Status::InvalidArgument( + "Meet none array column when flatten nested array, path {}, type {}", + subcolumn.path.get_path(), subcolumn.type->get_name()); + } + const auto* target_array = + check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get()); + if (!base_array->has_equal_offsets(*target_array)) { Review Comment: It's expensive to check offsets at runtime. Maybe you should just do it in DEBUG version. ########## be/src/olap/rowset/segment_v2/stream_reader.h: ########## @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> + +#include "olap/rowset/segment_v2/column_reader.h" + +namespace doris::segment_v2 { + +struct StreamReader { + vectorized::MutableColumnPtr column; + std::unique_ptr<ColumnIterator> iterator; + std::shared_ptr<const vectorized::IDataType> type; + bool inited = false; + size_t rows_read = 0; + StreamReader() = default; + StreamReader(vectorized::MutableColumnPtr&& col, std::unique_ptr<ColumnIterator>&& it, + std::shared_ptr<const vectorized::IDataType> t) + : column(std::move(col)), iterator(std::move(it)), type(t) {} +}; + +// path -> StreamReader +using SubstreamReaderTree = vectorized::SubcolumnsTree<StreamReader>; + +struct SubcolumnReader { + std::unique_ptr<ColumnReader> reader; + std::shared_ptr<const vectorized::IDataType> file_column_type; Review Comment: Can you put file_column_type inside StreamReader. ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.h: ########## @@ -168,26 +220,34 @@ class HierarchicalDataReader : public ColumnIterator { return Status::OK(); })); container->clear(); - if (_root_reader->column->is_nullable()) { - // fill nullmap - DCHECK(dst->is_nullable()); - vectorized::ColumnUInt8& dst_null_map = - assert_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column(); - vectorized::ColumnUInt8& src_null_map = - assert_cast<vectorized::ColumnNullable&>(*_root_reader->column) - .get_null_map_column(); - dst_null_map.insert_range_from(src_null_map, 0, src_null_map.size()); - // clear nullmap and inner data - src_null_map.clear(); - assert_cast<vectorized::ColumnObject&>( - assert_cast<vectorized::ColumnNullable&>(*_root_reader->column) - .get_nested_column()) - .clear_subcolumns_data(); + if (_root_reader) { + if (_root_reader->column->is_nullable()) { + // fill nullmap + DCHECK(dst->is_nullable()); + ColumnUInt8& dst_null_map = + assert_cast<ColumnNullable&>(*dst).get_null_map_column(); + ColumnUInt8& src_null_map = + assert_cast<ColumnNullable&>(*_root_reader->column).get_null_map_column(); + dst_null_map.insert_range_from(src_null_map, 0, src_null_map.size()); + // clear nullmap and inner data + src_null_map.clear(); + assert_cast<ColumnObject&>( + assert_cast<ColumnNullable&>(*_root_reader->column).get_nested_column()) + .clear_subcolumns_data(); + } else { + ColumnObject& root_column = assert_cast<ColumnObject&>(*_root_reader->column); + root_column.clear_subcolumns_data(); + } } else { - vectorized::ColumnObject& root_column = - assert_cast<vectorized::ColumnObject&>(*_root_reader->column); - root_column.clear_subcolumns_data(); + if (dst->is_nullable()) { + // No nullable info exist in hirearchical data, fill nullmap with all none null + ColumnUInt8& dst_null_map = + assert_cast<ColumnNullable&>(*dst).get_null_map_column(); + auto fake_nullable_column = ColumnUInt8::create(nrows, 0); Review Comment: Do you mean all NULL, but 0 is for NOT NULL. ########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const si return Status::OK(); } +Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst) { + bool has_null = false; + return next_batch(n, dst, &has_null); +} + +static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst, + vectorized::MutableColumnPtr& sibling_column, size_t nrows) { + const auto* sibling_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + remove_nullable(sibling_column->get_ptr())); + CHECK(sibling_array) << "Expected array column, but mmet " << sibling_column->get_name(); Review Comment: It's duplicate with `throw Exception` bellow. ########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const si return Status::OK(); } +Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst) { + bool has_null = false; + return next_batch(n, dst, &has_null); +} + +static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst, + vectorized::MutableColumnPtr& sibling_column, size_t nrows) { + const auto* sibling_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + remove_nullable(sibling_column->get_ptr())); + CHECK(sibling_array) << "Expected array column, but mmet " << sibling_column->get_name(); + ; Review Comment: useless ; ########## be/src/vec/columns/column_object.cpp: ########## @@ -896,18 +1062,28 @@ void ColumnObject::insert_range_from(const IColumn& src, size_t start, size_t le const auto& src_object = assert_cast<const ColumnObject&>(src); for (const auto& entry : src_object.subcolumns) { if (!has_subcolumn(entry->path)) { - add_sub_column(entry->path, num_rows); + if (entry->path.has_nested_part()) { + FieldInfo field_info { + .scalar_type_id = entry->data.least_common_type.get_base_type_id(), + .num_dimensions = entry->data.get_dimensions()}; + add_nested_subcolumn(entry->path, field_info, num_rows); Review Comment: ifelse branch for `add_nested_subcolumn` and `try_insert_many_defaults_from_nested` are repeated many times, can you combine `add_nested_subcolumn` and `add_sub_column`, `try_insert_many_defaults_from_nested` and `insert_many_defaults`. ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp: ########## @@ -51,12 +52,15 @@ Status HierarchicalDataReader::create(std::unique_ptr<ColumnIterator>* reader, // Eg. {"a" : "b" : {"c" : 1}}, access the `a.b` path and merge with root path so that // we could make sure the data could be fully merged, since some column may not be extracted but remains in root // like {"a" : "b" : {"e" : 1.1}} in jsonb format - ColumnIterator* it; - RETURN_IF_ERROR(root->data.reader->new_iterator(&it)); - stream_iter->set_root(std::make_unique<StreamReader>( - root->data.file_column_type->create_column(), std::unique_ptr<ColumnIterator>(it), - root->data.file_column_type)); + if (read_type == ReadType::MERGE_SPARSE) { Review Comment: Before this change, all type will run this branch. ########## fe/fe-core/src/test/java/org/apache/doris/task/AgentTaskTest.java: ########## @@ -108,7 +108,7 @@ public void setUp() throws AnalysisException { createReplicaTask = new CreateReplicaTask(backendId1, dbId, tableId, partitionId, indexId1, tabletId1, replicaId1, shortKeyNum, schemaHash1, version, KeysType.AGG_KEYS, storageType, TStorageMedium.SSD, columns, null, 0, latch, null, false, TTabletType.TABLET_TYPE_DISK, null, - TCompressionType.LZ4F, false, "", false, false, false, "", 0, 0, 0, 0, 0, false, null, null, objectPool, rowStorePageSize); + TCompressionType.LZ4F, false, "", false, false, false, "", 0, 0, 0, 0, 0, false, null, null, objectPool, rowStorePageSize, false); Review Comment: It's better to test with true. ########## be/src/vec/columns/column_object.cpp: ########## @@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const { auto& object = res.get<VariantMap&>(); for (const auto& entry : subcolumns) { - auto it = object.try_emplace(entry->path.get_path()).first; - entry->data.get(n, it->second); + Field field; + entry->data.get(n, field); + // Notice: we treat null as empty field + if (field.get_type() != Field::Types::Null) { + object.try_emplace(entry->path.get_path(), field); + } + } + if (object.empty()) { + res = Null(); + } +} + +void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo& field_info, + size_t new_size) { + if (!key.has_nested_part()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Cannot add Nested subcolumn, because path doesn't contain Nested"); + } + + bool inserted = false; + /// We find node that represents the same Nested type as @key. + const auto* nested_node = subcolumns.find_best_match(key); Review Comment: add check for the last nested in path ########## be/src/vec/columns/column_object.cpp: ########## @@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const { auto& object = res.get<VariantMap&>(); for (const auto& entry : subcolumns) { - auto it = object.try_emplace(entry->path.get_path()).first; - entry->data.get(n, it->second); + Field field; + entry->data.get(n, field); + // Notice: we treat null as empty field + if (field.get_type() != Field::Types::Null) { + object.try_emplace(entry->path.get_path(), field); + } + } + if (object.empty()) { + res = Null(); + } +} + +void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo& field_info, + size_t new_size) { + if (!key.has_nested_part()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Cannot add Nested subcolumn, because path doesn't contain Nested"); + } + + bool inserted = false; + /// We find node that represents the same Nested type as @key. + const auto* nested_node = subcolumns.find_best_match(key); Review Comment: eg. request key is a.b.c, but a.b is a nested node and a.b is returned, then all operation is performed on a.b instead of a.c. ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -599,6 +604,44 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, type == ReaderType::READER_FULL_COMPACTION || type == ReaderType::READER_CHECKSUM; }; + auto new_default_iter = [&]() { + if (tablet_column.is_nested_subcolumn() && + type_to_read_flat_leaves(opt->io_ctx.reader_type)) { + // We find node that represents the same Nested type as path. + const auto* parent = _sub_column_tree.find_best_match(*tablet_column.path_info_ptr()); + VLOG_DEBUG << "find with path " << tablet_column.path_info_ptr()->get_path() + << " parent " << (parent ? parent->path.get_path() : "nullptr") << ", type " + << ", parent is nested " << (parent ? parent->is_nested() : false) << ", " + << TabletColumn::get_string_by_field_type(tablet_column.type()); + // find it's common parent with nested part + // why not use parent->path->has_nested_part? because parent may not be a leaf node + // none leaf node may not contain path info + // Example: + // {"payload" : {"commits" : [{"issue" : {"id" : 123, "email" : "a@b"}}]}} + // nested node path : payload.commits(NESTED) + // tablet_column path_info : payload.commits.issue.id(SCALAR + // parent path node : payload.commits.issue(TUPLE) + // leaf path_info : payload.commits.issue.email(SCALAR) + if (parent && SubcolumnColumnReaders::find_parent( + parent, [](const auto& node) { return node.is_nested(); })) { + /// Find any leaf of Nested subcolumn. + const auto* leaf = SubcolumnColumnReaders::find_leaf( Review Comment: Maybe you can encapsulate a function find_sibling_with_same_nested. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org