github-actions[bot] commented on code in PR #45492: URL: https://github.com/apache/doris/pull/45492#discussion_r1886886120
########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -220,6 +222,146 @@ Status ColumnReader::create_agg_state(const ColumnReaderOptions& opts, const Col agg_state_type->get_name(), int(type)); } +const SubcolumnColumnReaders::Node* VariantColumnReader::get_reader_by_path( + const vectorized::PathInData& relative_path) const { + return _subcolumn_readers->find_leaf(relative_path); +} + +Status VariantColumnReader::new_iterator(ColumnIterator** iterator, Review Comment: warning: function 'new_iterator' has cognitive complexity of 51 (threshold 50) [readability-function-cognitive-complexity] ```cpp Status VariantColumnReader::new_iterator(ColumnIterator** iterator, ^ ``` <details> <summary>Additional context</summary> **be/src/olap/rowset/segment_v2/column_reader.cpp:237:** +1, including nesting penalty of 0, nesting level increased to 1 ```cpp if (node != nullptr) { ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:238:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (node->is_leaf_node()) { ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:242:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(node->data.reader->new_iterator(iterator)); ^ ``` **be/src/common/status.h:632:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:242:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(node->data.reader->new_iterator(iterator)); ^ ``` **be/src/common/status.h:634:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:243:** +1, nesting level increased to 2 ```cpp } else { ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:247:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp if (!_sparse_column_set_in_stats.empty()) { ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:250:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&iter)); ^ ``` **be/src/common/status.h:632:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:250:** +5, including nesting penalty of 4, nesting level increased to 5 ```cpp RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&iter)); ^ ``` **be/src/common/status.h:634:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:255:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp (relative_path == root->path) ? HierarchicalDataReader::ReadType::MERGE_ROOT ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:257:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(HierarchicalDataReader::create(iterator, relative_path, node, root, ^ ``` **be/src/common/status.h:632:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:257:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(HierarchicalDataReader::create(iterator, relative_path, node, root, ^ ``` **be/src/common/status.h:634:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:260:** +1, nesting level increased to 1 ```cpp } else { ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:261:** +2, including nesting penalty of 1, nesting level increased to 2 ```cpp if (_sparse_column_set_in_stats.contains(StringRef {relative_path.get_path()}) || ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:266:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter)); ^ ``` **be/src/common/status.h:632:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:266:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(_sparse_column_reader->new_iterator(&inner_iter)); ^ ``` **be/src/common/status.h:634:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:269:** +1, nesting level increased to 2 ```cpp } else { ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:272:** +3, including nesting penalty of 2, nesting level increased to 3 ```cpp RETURN_IF_ERROR(Segment::new_default_iterator(target_col, &iter)); ^ ``` **be/src/common/status.h:632:** expanded from macro 'RETURN_IF_ERROR' ```cpp do { \ ^ ``` **be/src/olap/rowset/segment_v2/column_reader.cpp:272:** +4, including nesting penalty of 3, nesting level increased to 4 ```cpp RETURN_IF_ERROR(Segment::new_default_iterator(target_col, &iter)); ^ ``` **be/src/common/status.h:634:** expanded from macro 'RETURN_IF_ERROR' ```cpp if (UNLIKELY(!_status_.ok())) { \ ^ ``` </details> ########## be/src/vec/columns/column_object.cpp: ########## @@ -1717,89 +1776,203 @@ void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::Alloca } Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - rapidjson::StringBuffer buf; - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); *output = type->to_string(*get_root(), row); return Status::OK(); } - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - // TODO avoid copy - *output = std::string(buf.GetString(), buf.GetSize()); + // TODO preallocate memory + auto tmp_col = ColumnString::create(); + VectorBufferWriter write_buffer(*tmp_col.get()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, write_buffer, nullptr)); + write_buffer.commit(); + auto str_ref = tmp_col->get_data_at(0); + *output = std::string(str_ref.data, str_ref.size); return Status::OK(); } Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); type->to_string(*get_root(), row, output); return Status::OK(); } - rapidjson::StringBuffer buf; - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - output.write(buf.GetString(), buf.GetLength()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, output, nullptr)); return Status::OK(); } -Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, - bool* is_null) const { - CHECK(is_finalized()); - if (subcolumns.empty()) { - if (is_null != nullptr) { - *is_null = true; - } else { - rapidjson::Value root(rapidjson::kNullType); - rapidjson::Writer<rapidjson::StringBuffer> writer(*output); - if (!root.Accept(writer)) { - return Status::InternalError("Failed to serialize json value"); +/// Struct that represents elements of the JSON path. +/// "a.b.c" -> ["a", "b", "c"] +struct PathElements { + explicit PathElements(const String& path) { + const char* start = path.data(); + const char* end = start + path.size(); + const char* pos = start; + const char* last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) { + if (*pos == '.') { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; } } - return Status::OK(); + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); } - CHECK(size() > row); - rapidjson::StringBuffer buffer; - rapidjson::Value root(rapidjson::kNullType); - if (doc_structure == nullptr) { - doc_structure = std::make_shared<rapidjson::Document>(); - rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); - get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + + size_t size() const { return elements.size(); } + + std::vector<std::string_view> elements; +}; + +/// Struct that represents a prefix of a JSON path. Used during output of the JSON object. +struct Prefix { + /// Shrink current prefix to the common prefix of current prefix and specified path. + /// For example, if current prefix is a.b.c.d and path is a.b.e, then shrink the prefix to a.b. + void shrink_to_common_prefix(const PathElements& path_elements) { + /// Don't include last element in path_elements in the prefix. + size_t i = 0; + while (i != elements.size() && i != (path_elements.elements.size() - 1) && + elements[i].first == path_elements.elements[i]) + ++i; Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion elements[i].first == path_elements.elements[i]) { ++i; } ``` ########## be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp: ########## @@ -150,95 +156,295 @@ ordinal_t HierarchicalDataReader::get_current_ordinal() const { return (*_substream_reader.begin())->data.iterator->get_current_ordinal(); } -Status ExtractReader::init(const ColumnIteratorOptions& opts) { - if (!_root_reader->inited) { - RETURN_IF_ERROR(_root_reader->iterator->init(opts)); - _root_reader->inited = true; +Status HierarchicalDataReader::_process_sub_columns( + vectorized::ColumnObject& container_variant, + const vectorized::PathsWithColumnAndType& non_nested_subcolumns) { + for (const auto& entry : non_nested_subcolumns) { + DCHECK(!entry.path.has_nested_part()); + bool add = container_variant.add_sub_column(entry.path, entry.column->assume_mutable(), + entry.type); + if (!add) { + return Status::InternalError("Duplicated {}, type {}", entry.path.get_path(), + entry.type->get_name()); + } } return Status::OK(); } -Status ExtractReader::seek_to_first() { - LOG(FATAL) << "Not implemented"; - __builtin_unreachable(); +Status HierarchicalDataReader::_process_nested_columns( + vectorized::ColumnObject& container_variant, + const std::map<vectorized::PathInData, vectorized::PathsWithColumnAndType>& + nested_subcolumns) { + using namespace vectorized; + // Iterate nested subcolumns and flatten them, the entry contains the nested subcolumns of the same nested parent + // first we pick the first subcolumn as base array and using it's offset info. Then we flatten all nested subcolumns + // into a new object column and wrap it with array column using the first element offsets.The wrapped array column + // will type the type of ColumnObject::NESTED_TYPE, whih is Nullable<ColumnArray<NULLABLE(ColumnObject)>>. + for (const auto& entry : nested_subcolumns) { + MutableColumnPtr nested_object = ColumnObject::create(true, false); + const auto* base_array = + check_and_get_column<ColumnArray>(remove_nullable(entry.second[0].column)); + MutableColumnPtr offset = base_array->get_offsets_ptr()->assume_mutable(); + auto* nested_object_ptr = assert_cast<ColumnObject*>(nested_object.get()); + // flatten nested arrays + for (const auto& subcolumn : entry.second) { + const auto& column = subcolumn.column; + const auto& type = subcolumn.type; + if (!remove_nullable(column)->is_column_array()) { + return Status::InvalidArgument( + "Meet none array column when flatten nested array, path {}, type {}", + subcolumn.path.get_path(), subcolumn.type->get_name()); + } + const auto* target_array = + check_and_get_column<ColumnArray>(remove_nullable(subcolumn.column).get()); +#ifndef NDEBUG + if (!base_array->has_equal_offsets(*target_array)) { + return Status::InvalidArgument( + "Meet none equal offsets array when flatten nested array, path {}, " + "type {}", + subcolumn.path.get_path(), subcolumn.type->get_name()); + } +#endif + MutableColumnPtr flattend_column = target_array->get_data_ptr()->assume_mutable(); + DataTypePtr flattend_type = + check_and_get_data_type<DataTypeArray>(remove_nullable(type).get()) + ->get_nested_type(); + // add sub path without parent prefix + nested_object_ptr->add_sub_column( + subcolumn.path.copy_pop_nfront(entry.first.get_parts().size()), + std::move(flattend_column), std::move(flattend_type)); + } + nested_object = make_nullable(nested_object->get_ptr())->assume_mutable(); + auto array = + make_nullable(ColumnArray::create(std::move(nested_object), std::move(offset))); + PathInDataBuilder builder; + // add parent prefix + builder.append(entry.first.get_parts(), false); + PathInData parent_path = builder.build(); + // unset nested parts + parent_path.unset_nested(); + DCHECK(!parent_path.has_nested_part()); + container_variant.add_sub_column(parent_path, array->assume_mutable(), + ColumnObject::NESTED_TYPE); + } + return Status::OK(); } -Status ExtractReader::seek_to_ordinal(ordinal_t ord) { - CHECK(_root_reader->inited); - return _root_reader->iterator->seek_to_ordinal(ord); -} +Status HierarchicalDataReader::_init_container(vectorized::MutableColumnPtr& container, + size_t nrows) { + using namespace vectorized; + // build variant as container + container = ColumnObject::create(true, false); + auto& container_variant = assert_cast<ColumnObject&>(*container); -Status ExtractReader::extract_to(vectorized::MutableColumnPtr& dst, size_t nrows) { - DCHECK(_root_reader); - DCHECK(_root_reader->inited); - vectorized::ColumnNullable* nullable_column = nullptr; - if (dst->is_nullable()) { - nullable_column = assert_cast<vectorized::ColumnNullable*>(dst.get()); + // add root first + if (_path.get_parts().empty() && _root_reader) { + auto& root_var = + _root_reader->column->is_nullable() + ? assert_cast<vectorized::ColumnObject&>( + assert_cast<vectorized::ColumnNullable&>(*_root_reader->column) + .get_nested_column()) + : assert_cast<vectorized::ColumnObject&>(*_root_reader->column); + auto column = root_var.get_root(); + auto type = root_var.get_root_type(); + container_variant.add_sub_column({}, std::move(column), type); } - auto& variant = - nullable_column == nullptr - ? assert_cast<vectorized::ColumnObject&>(*dst) - : assert_cast<vectorized::ColumnObject&>(nullable_column->get_nested_column()); - const auto& root = - _root_reader->column->is_nullable() - ? assert_cast<vectorized::ColumnObject&>( - assert_cast<vectorized::ColumnNullable&>(*_root_reader->column) - .get_nested_column()) - : assert_cast<const vectorized::ColumnObject&>(*_root_reader->column); - // extract root value with path, we can't modify the original root column - // since some other column may depend on it. - vectorized::MutableColumnPtr extracted_column; - RETURN_IF_ERROR(root.extract_root( // trim the root name, eg. v.a.b -> a.b - _col.path_info_ptr()->copy_pop_front(), extracted_column)); - - if (_target_type_hint != nullptr) { - variant.create_root(_target_type_hint, _target_type_hint->create_column()); + // parent path -> subcolumns + std::map<PathInData, PathsWithColumnAndType> nested_subcolumns; + PathsWithColumnAndType non_nested_subcolumns; + RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { + MutableColumnPtr column = node.data.column->get_ptr(); + PathInData relative_path = node.path.copy_pop_nfront(_path.get_parts().size()); + + if (node.path.has_nested_part()) { + CHECK_EQ(getTypeName(remove_nullable(node.data.type)->get_type_id()), + getTypeName(TypeIndex::Array)); + PathInData parent_path = + node.path.get_nested_prefix_path().copy_pop_nfront(_path.get_parts().size()); + nested_subcolumns[parent_path].emplace_back(relative_path, column->get_ptr(), + node.data.type); + } else { + non_nested_subcolumns.emplace_back(relative_path, column->get_ptr(), node.data.type); + } + return Status::OK(); + })); + + RETURN_IF_ERROR(_process_sub_columns(container_variant, non_nested_subcolumns)); + + RETURN_IF_ERROR(_process_nested_columns(container_variant, nested_subcolumns)); + + RETURN_IF_ERROR(_process_sparse_column(container_variant, nrows)); + return Status::OK(); +} + +// Return sub-path by specified prefix. +// For example, for prefix a.b: +// a.b.c.d -> c.d, a.b.c -> c +static std::string_view get_sub_path(const std::string_view& path, const std::string_view& prefix) { + return path.substr(prefix.size() + 1); +} + +Status HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject& container_variant, + size_t nrows) { + using namespace vectorized; + if (!_sparse_column_reader) { + container_variant.get_sparse_column()->assume_mutable()->insert_many_defaults(nrows); + return Status::OK(); } - if (variant.empty() || variant.is_null_root()) { - variant.create_root(root.get_root_type(), std::move(extracted_column)); + // process sparse column + if (_path.get_parts().empty()) { + // directly use sparse column if access root + container_variant.set_sparse_column(_sparse_column_reader->column->get_ptr()); } else { - vectorized::ColumnPtr cast_column; - const auto& expected_type = variant.get_root_type(); - RETURN_IF_ERROR(vectorized::schema_util::cast_column( - {extracted_column->get_ptr(), - vectorized::make_nullable( - std::make_shared<vectorized::ColumnObject::MostCommonType>()), - ""}, - expected_type, &cast_column)); - variant.get_root()->insert_range_from(*cast_column, 0, nrows); - // variant.set_num_rows(variant.get_root()->size()); + const auto& offsets = + assert_cast<const ColumnMap&>(*_sparse_column_reader->column).get_offsets(); + /// Check if there is no data in shared data in current range. + if (offsets.back() == offsets[-1]) { + container_variant.get_sparse_column()->assume_mutable()->insert_many_defaults(nrows); + } else { + // Read for variant sparse column + // Example path: a.b + // data: a.b.c : int|123 + // a.b.d : string|"456" + // a.e.d : string|"789" + // then the extracted sparse column will be: + // c : int|123 + // d : string|"456" + const auto& sparse_data_map = + assert_cast<const ColumnMap&>(*_sparse_column_reader->column); + const auto& src_sparse_data_offsets = sparse_data_map.get_offsets(); + const auto& src_sparse_data_paths = + assert_cast<const ColumnString&>(sparse_data_map.get_keys()); + const auto& src_sparse_data_values = + assert_cast<const ColumnString&>(sparse_data_map.get_values()); + + auto& sparse_data_offsets = + assert_cast<ColumnMap&>( + *container_variant.get_sparse_column()->assume_mutable()) + .get_offsets(); + auto [sparse_data_paths, sparse_data_values] = + container_variant.get_sparse_data_paths_and_values(); + StringRef prefix_ref(_path.get_path()); + std::string_view path_prefix(prefix_ref.data, prefix_ref.size); + for (size_t i = 0; i != src_sparse_data_offsets.size(); ++i) { + size_t start = src_sparse_data_offsets[ssize_t(i) - 1]; + size_t end = src_sparse_data_offsets[ssize_t(i)]; + size_t lower_bound_index = + vectorized::ColumnObject::find_path_lower_bound_in_sparse_data( + prefix_ref, src_sparse_data_paths, start, end); + for (; lower_bound_index != end; ++lower_bound_index) { + auto path_ref = src_sparse_data_paths.get_data_at(lower_bound_index); + std::string_view path(path_ref.data, path_ref.size); + if (!path.starts_with(path_prefix)) { + break; + } + // Don't include path that is equal to the prefix. + if (path.size() != path_prefix.size()) { + auto sub_path = get_sub_path(path, path_prefix); + sparse_data_paths->insert_data(sub_path.data(), sub_path.size()); + sparse_data_values->insert_from(src_sparse_data_values, lower_bound_index); + } + } + sparse_data_offsets.push_back(sparse_data_paths->size()); + } + } } - if (dst->is_nullable()) { - // fill nullmap - vectorized::ColumnUInt8& dst_null_map = - assert_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column(); - vectorized::ColumnUInt8& src_null_map = - assert_cast<vectorized::ColumnNullable&>(*variant.get_root()).get_null_map_column(); - dst_null_map.insert_range_from(src_null_map, 0, src_null_map.size()); + return Status::OK(); +} + +Status HierarchicalDataReader::_init_null_map_and_clear_columns( + vectorized::MutableColumnPtr& container, vectorized::MutableColumnPtr& dst, size_t nrows) { + using namespace vectorized; + // clear data in nodes + RETURN_IF_ERROR(tranverse([&](SubstreamReaderTree::Node& node) { + node.data.column->clear(); + return Status::OK(); + })); + container->clear(); + _sparse_column_reader->column->clear(); + if (_root_reader) { + if (_root_reader->column->is_nullable()) { + // fill nullmap + DCHECK(dst->is_nullable()); + ColumnUInt8& dst_null_map = assert_cast<ColumnNullable&>(*dst).get_null_map_column(); + ColumnUInt8& src_null_map = + assert_cast<ColumnNullable&>(*_root_reader->column).get_null_map_column(); + dst_null_map.insert_range_from(src_null_map, 0, src_null_map.size()); + // clear nullmap and inner data + src_null_map.clear(); + assert_cast<ColumnObject&>( + assert_cast<ColumnNullable&>(*_root_reader->column).get_nested_column()) + .clear_column_data(); + } else { + auto& root_column = assert_cast<ColumnObject&>(*_root_reader->column); + root_column.clear_column_data(); + } + } else { + if (dst->is_nullable()) { + // No nullable info exist in hirearchical data, fill nullmap with all none null + ColumnUInt8& dst_null_map = assert_cast<ColumnNullable&>(*dst).get_null_map_column(); + auto fake_nullable_column = ColumnUInt8::create(nrows, 0); + dst_null_map.insert_range_from(*fake_nullable_column, 0, nrows); + } } - _root_reader->column->clear(); -#ifndef NDEBUG - variant.check_consistency(); -#endif return Status::OK(); } -Status ExtractReader::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, bool* has_null) { - RETURN_IF_ERROR(_root_reader->iterator->next_batch(n, _root_reader->column)); - RETURN_IF_ERROR(extract_to(dst, *n)); +Status SparseColumnExtractReader::init(const ColumnIteratorOptions& opts) { + return _sparse_column_reader->init(opts); +} + +Status SparseColumnExtractReader::seek_to_first() { + return _sparse_column_reader->seek_to_first(); +} + +Status SparseColumnExtractReader::seek_to_ordinal(ordinal_t ord) { + return _sparse_column_reader->seek_to_ordinal(ord); +} + +void SparseColumnExtractReader::_fill_path_column(vectorized::MutableColumnPtr& dst) { + vectorized::ColumnObject& var = + dst->is_nullable() + ? assert_cast<vectorized::ColumnObject&>( + assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) + : assert_cast<vectorized::ColumnObject&>(*dst); + DCHECK(!var.is_null_root()); + vectorized::ColumnObject::fill_path_olumn_from_sparse_data( + *var.get_subcolumn({}) /*root*/, StringRef {_path.data(), _path.size()}, + _sparse_column->get_ptr(), 0, _sparse_column->size()); + _sparse_column->clear(); +} + +Status SparseColumnExtractReader::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, Review Comment: warning: pointer parameter 'n' can be pointer to const [readability-non-const-parameter] ```suggestion Status SparseColumnExtractReader::next_batch(const size_t* n, vectorized::MutableColumnPtr& dst, ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -1717,89 +1776,203 @@ } Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - rapidjson::StringBuffer buf; - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); *output = type->to_string(*get_root(), row); return Status::OK(); } - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - // TODO avoid copy - *output = std::string(buf.GetString(), buf.GetSize()); + // TODO preallocate memory + auto tmp_col = ColumnString::create(); + VectorBufferWriter write_buffer(*tmp_col.get()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, write_buffer, nullptr)); + write_buffer.commit(); + auto str_ref = tmp_col->get_data_at(0); + *output = std::string(str_ref.data, str_ref.size); return Status::OK(); } Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); type->to_string(*get_root(), row, output); return Status::OK(); } - rapidjson::StringBuffer buf; - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - output.write(buf.GetString(), buf.GetLength()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, output, nullptr)); return Status::OK(); } -Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, - bool* is_null) const { - CHECK(is_finalized()); - if (subcolumns.empty()) { - if (is_null != nullptr) { - *is_null = true; - } else { - rapidjson::Value root(rapidjson::kNullType); - rapidjson::Writer<rapidjson::StringBuffer> writer(*output); - if (!root.Accept(writer)) { - return Status::InternalError("Failed to serialize json value"); +/// Struct that represents elements of the JSON path. +/// "a.b.c" -> ["a", "b", "c"] +struct PathElements { + explicit PathElements(const String& path) { + const char* start = path.data(); + const char* end = start + path.size(); + const char* pos = start; + const char* last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) { + if (*pos == '.') { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; } } - return Status::OK(); + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); } - CHECK(size() > row); - rapidjson::StringBuffer buffer; - rapidjson::Value root(rapidjson::kNullType); - if (doc_structure == nullptr) { - doc_structure = std::make_shared<rapidjson::Document>(); - rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); - get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + + size_t size() const { return elements.size(); } + + std::vector<std::string_view> elements; +}; + +/// Struct that represents a prefix of a JSON path. Used during output of the JSON object. +struct Prefix { + /// Shrink current prefix to the common prefix of current prefix and specified path. + /// For example, if current prefix is a.b.c.d and path is a.b.e, then shrink the prefix to a.b. + void shrink_to_common_prefix(const PathElements& path_elements) { + /// Don't include last element in path_elements in the prefix. + size_t i = 0; + while (i != elements.size() && i != (path_elements.elements.size() - 1) && + elements[i].first == path_elements.elements[i]) + ++i; + elements.resize(i); } - if (!doc_structure->IsNull()) { - root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + + /// Check is_first flag in current object. + bool is_first_in_current_object() const { + if (elements.empty()) return root_is_first_flag; + return elements.back().second; } - Arena mem_pool; -#ifndef NDEBUG - VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); -#endif - for (const auto& subcolumn : subcolumns) { - RETURN_IF_ERROR(find_and_set_leave_value( - subcolumn->data.get_finalized_column_ptr(), subcolumn->path, - subcolumn->data.get_least_common_type_serde(), - subcolumn->data.get_least_common_type(), - subcolumn->data.least_common_type.get_base_type_id(), root, - doc_structure->GetAllocator(), mem_pool, row)); - if (subcolumn->path.empty() && !root.IsObject()) { - // root was modified, only handle root node - break; - } + + /// Set flag is_first = false in current object. + void set_not_first_in_current_object() { + if (elements.empty()) + root_is_first_flag = false; + else + elements.back().second = false; Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion else { e }lements.back().second = false; ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -1717,89 +1776,203 @@ } Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - rapidjson::StringBuffer buf; - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); *output = type->to_string(*get_root(), row); return Status::OK(); } - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - // TODO avoid copy - *output = std::string(buf.GetString(), buf.GetSize()); + // TODO preallocate memory + auto tmp_col = ColumnString::create(); + VectorBufferWriter write_buffer(*tmp_col.get()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, write_buffer, nullptr)); + write_buffer.commit(); + auto str_ref = tmp_col->get_data_at(0); + *output = std::string(str_ref.data, str_ref.size); return Status::OK(); } Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); type->to_string(*get_root(), row, output); return Status::OK(); } - rapidjson::StringBuffer buf; - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - output.write(buf.GetString(), buf.GetLength()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, output, nullptr)); return Status::OK(); } -Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, - bool* is_null) const { - CHECK(is_finalized()); - if (subcolumns.empty()) { - if (is_null != nullptr) { - *is_null = true; - } else { - rapidjson::Value root(rapidjson::kNullType); - rapidjson::Writer<rapidjson::StringBuffer> writer(*output); - if (!root.Accept(writer)) { - return Status::InternalError("Failed to serialize json value"); +/// Struct that represents elements of the JSON path. +/// "a.b.c" -> ["a", "b", "c"] +struct PathElements { + explicit PathElements(const String& path) { + const char* start = path.data(); + const char* end = start + path.size(); + const char* pos = start; + const char* last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) { + if (*pos == '.') { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; } } - return Status::OK(); + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); } - CHECK(size() > row); - rapidjson::StringBuffer buffer; - rapidjson::Value root(rapidjson::kNullType); - if (doc_structure == nullptr) { - doc_structure = std::make_shared<rapidjson::Document>(); - rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); - get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + + size_t size() const { return elements.size(); } + + std::vector<std::string_view> elements; +}; + +/// Struct that represents a prefix of a JSON path. Used during output of the JSON object. +struct Prefix { + /// Shrink current prefix to the common prefix of current prefix and specified path. + /// For example, if current prefix is a.b.c.d and path is a.b.e, then shrink the prefix to a.b. + void shrink_to_common_prefix(const PathElements& path_elements) { + /// Don't include last element in path_elements in the prefix. + size_t i = 0; + while (i != elements.size() && i != (path_elements.elements.size() - 1) && + elements[i].first == path_elements.elements[i]) + ++i; + elements.resize(i); } - if (!doc_structure->IsNull()) { - root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + + /// Check is_first flag in current object. + bool is_first_in_current_object() const { + if (elements.empty()) return root_is_first_flag; Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (elements.empty()) { return root_is_first_flag; } ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -1717,89 +1776,203 @@ } Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - rapidjson::StringBuffer buf; - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); *output = type->to_string(*get_root(), row); return Status::OK(); } - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - // TODO avoid copy - *output = std::string(buf.GetString(), buf.GetSize()); + // TODO preallocate memory + auto tmp_col = ColumnString::create(); + VectorBufferWriter write_buffer(*tmp_col.get()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, write_buffer, nullptr)); + write_buffer.commit(); + auto str_ref = tmp_col->get_data_at(0); + *output = std::string(str_ref.data, str_ref.size); return Status::OK(); } Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); type->to_string(*get_root(), row, output); return Status::OK(); } - rapidjson::StringBuffer buf; - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - output.write(buf.GetString(), buf.GetLength()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, output, nullptr)); return Status::OK(); } -Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, - bool* is_null) const { - CHECK(is_finalized()); - if (subcolumns.empty()) { - if (is_null != nullptr) { - *is_null = true; - } else { - rapidjson::Value root(rapidjson::kNullType); - rapidjson::Writer<rapidjson::StringBuffer> writer(*output); - if (!root.Accept(writer)) { - return Status::InternalError("Failed to serialize json value"); +/// Struct that represents elements of the JSON path. +/// "a.b.c" -> ["a", "b", "c"] +struct PathElements { + explicit PathElements(const String& path) { + const char* start = path.data(); + const char* end = start + path.size(); + const char* pos = start; + const char* last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) { + if (*pos == '.') { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; } } - return Status::OK(); + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); } - CHECK(size() > row); - rapidjson::StringBuffer buffer; - rapidjson::Value root(rapidjson::kNullType); - if (doc_structure == nullptr) { - doc_structure = std::make_shared<rapidjson::Document>(); - rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); - get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + + size_t size() const { return elements.size(); } + + std::vector<std::string_view> elements; +}; + +/// Struct that represents a prefix of a JSON path. Used during output of the JSON object. +struct Prefix { + /// Shrink current prefix to the common prefix of current prefix and specified path. + /// For example, if current prefix is a.b.c.d and path is a.b.e, then shrink the prefix to a.b. + void shrink_to_common_prefix(const PathElements& path_elements) { + /// Don't include last element in path_elements in the prefix. + size_t i = 0; + while (i != elements.size() && i != (path_elements.elements.size() - 1) && + elements[i].first == path_elements.elements[i]) + ++i; + elements.resize(i); } - if (!doc_structure->IsNull()) { - root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + + /// Check is_first flag in current object. + bool is_first_in_current_object() const { + if (elements.empty()) return root_is_first_flag; + return elements.back().second; } - Arena mem_pool; -#ifndef NDEBUG - VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); -#endif - for (const auto& subcolumn : subcolumns) { - RETURN_IF_ERROR(find_and_set_leave_value( - subcolumn->data.get_finalized_column_ptr(), subcolumn->path, - subcolumn->data.get_least_common_type_serde(), - subcolumn->data.get_least_common_type(), - subcolumn->data.least_common_type.get_base_type_id(), root, - doc_structure->GetAllocator(), mem_pool, row)); - if (subcolumn->path.empty() && !root.IsObject()) { - // root was modified, only handle root node - break; - } + + /// Set flag is_first = false in current object. + void set_not_first_in_current_object() { + if (elements.empty()) + root_is_first_flag = false; + else + elements.back().second = false; } - compact_null_values(root, doc_structure->GetAllocator()); - if (root.IsNull() && is_null != nullptr) { - // Fast path - *is_null = true; - } else { - output->Clear(); - rapidjson::Writer<rapidjson::StringBuffer> writer(*output); - if (!root.Accept(writer)) { - return Status::InternalError("Failed to serialize json value"); + + size_t size() const { return elements.size(); } + + /// Elements of the prefix: (path element, is_first flag in this prefix). + /// is_first flag indicates if we already serialized some key in the object with such prefix. + std::vector<std::pair<std::string_view, bool>> elements; + bool root_is_first_flag = true; +}; + +Status ColumnObject::serialize_one_row_to_json_format(int64_t row_num, BufferWritable& output, Review Comment: warning: function 'serialize_one_row_to_json_format' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp Status ColumnObject::serialize_one_row_to_json_format(int64_t row_num, BufferWritable& output, ^ ``` <details> <summary>Additional context</summary> **be/src/vec/columns/column_object.cpp:1867:** 108 lines including whitespace and comments (threshold 80) ```cpp Status ColumnObject::serialize_one_row_to_json_format(int64_t row_num, BufferWritable& output, ^ ``` </details> ########## be/src/vec/columns/column_object.cpp: ########## @@ -1717,89 +1776,203 @@ } Status ColumnObject::serialize_one_row_to_string(int64_t row, std::string* output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - rapidjson::StringBuffer buf; - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); *output = type->to_string(*get_root(), row); return Status::OK(); } - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - // TODO avoid copy - *output = std::string(buf.GetString(), buf.GetSize()); + // TODO preallocate memory + auto tmp_col = ColumnString::create(); + VectorBufferWriter write_buffer(*tmp_col.get()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, write_buffer, nullptr)); + write_buffer.commit(); + auto str_ref = tmp_col->get_data_at(0); + *output = std::string(str_ref.data, str_ref.size); return Status::OK(); } Status ColumnObject::serialize_one_row_to_string(int64_t row, BufferWritable& output) const { - if (!is_finalized()) { - const_cast<ColumnObject*>(this)->finalize(); - } - if (is_scalar_variant()) { + // if (!is_finalized()) { + // const_cast<ColumnObject*>(this)->finalize(); + // } + if (is_scalar_variant() && is_finalized()) { auto type = get_root_type(); type->to_string(*get_root(), row, output); return Status::OK(); } - rapidjson::StringBuffer buf; - RETURN_IF_ERROR(serialize_one_row_to_json_format(row, &buf, nullptr)); - output.write(buf.GetString(), buf.GetLength()); + RETURN_IF_ERROR(serialize_one_row_to_json_format(row, output, nullptr)); return Status::OK(); } -Status ColumnObject::serialize_one_row_to_json_format(int64_t row, rapidjson::StringBuffer* output, - bool* is_null) const { - CHECK(is_finalized()); - if (subcolumns.empty()) { - if (is_null != nullptr) { - *is_null = true; - } else { - rapidjson::Value root(rapidjson::kNullType); - rapidjson::Writer<rapidjson::StringBuffer> writer(*output); - if (!root.Accept(writer)) { - return Status::InternalError("Failed to serialize json value"); +/// Struct that represents elements of the JSON path. +/// "a.b.c" -> ["a", "b", "c"] +struct PathElements { + explicit PathElements(const String& path) { + const char* start = path.data(); + const char* end = start + path.size(); + const char* pos = start; + const char* last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) { + if (*pos == '.') { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; } } - return Status::OK(); + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); } - CHECK(size() > row); - rapidjson::StringBuffer buffer; - rapidjson::Value root(rapidjson::kNullType); - if (doc_structure == nullptr) { - doc_structure = std::make_shared<rapidjson::Document>(); - rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); - get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + + size_t size() const { return elements.size(); } + + std::vector<std::string_view> elements; +}; + +/// Struct that represents a prefix of a JSON path. Used during output of the JSON object. +struct Prefix { + /// Shrink current prefix to the common prefix of current prefix and specified path. + /// For example, if current prefix is a.b.c.d and path is a.b.e, then shrink the prefix to a.b. + void shrink_to_common_prefix(const PathElements& path_elements) { + /// Don't include last element in path_elements in the prefix. + size_t i = 0; + while (i != elements.size() && i != (path_elements.elements.size() - 1) && + elements[i].first == path_elements.elements[i]) + ++i; + elements.resize(i); } - if (!doc_structure->IsNull()) { - root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + + /// Check is_first flag in current object. + bool is_first_in_current_object() const { + if (elements.empty()) return root_is_first_flag; + return elements.back().second; } - Arena mem_pool; -#ifndef NDEBUG - VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); -#endif - for (const auto& subcolumn : subcolumns) { - RETURN_IF_ERROR(find_and_set_leave_value( - subcolumn->data.get_finalized_column_ptr(), subcolumn->path, - subcolumn->data.get_least_common_type_serde(), - subcolumn->data.get_least_common_type(), - subcolumn->data.least_common_type.get_base_type_id(), root, - doc_structure->GetAllocator(), mem_pool, row)); - if (subcolumn->path.empty() && !root.IsObject()) { - // root was modified, only handle root node - break; - } + + /// Set flag is_first = false in current object. + void set_not_first_in_current_object() { + if (elements.empty()) Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (elements.empty()) { ``` be/src/vec/columns/column_object.cpp:1855: ```diff - else + } else ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org