github-actions[bot] commented on code in PR #24554: URL: https://github.com/apache/doris/pull/24554#discussion_r1349459421
########## be/src/vec/columns/column.h: ########## @@ -144,15 +144,19 @@ class IColumn : public COW<IColumn> { return nullptr; } + /// Some columns may require finalization before using of other operations. + virtual void finalize() {} + + MutablePtr clone_finalized() const { Review Comment: warning: function 'clone_finalized' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] MutablePtr clone_finalized() const { ``` ########## be/src/vec/columns/column.h: ########## @@ -603,6 +607,8 @@ virtual bool is_hll() const { return false; } + virtual bool is_variant() const { return false; } Review Comment: warning: function 'is_variant' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] virtual bool is_variant() const { return false; } ``` ########## be/src/vec/columns/column_array.h: ########## @@ -270,6 +269,8 @@ class ColumnArray final : public COWHelper<IColumn, ColumnArray> { ColumnPtr index(const IColumn& indexes, size_t limit) const override; + double get_ratio_of_default_rows(double sample_ratio) const override; Review Comment: warning: function 'get_ratio_of_default_rows' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] double get_ratio_of_default_rows(double sample_ratio) const override; ``` ########## be/src/vec/columns/column_dictionary.h: ########## @@ -114,14 +114,11 @@ class ColumnDictionary final : public COWHelper<IColumn, ColumnDictionary<T>> { void reserve(size_t n) override { _codes.reserve(n); } - [[noreturn]] TypeIndex get_data_type() const override { - LOG(FATAL) << "ColumnDictionary get_data_type not implemeted"; - } - const char* get_family_name() const override { return "ColumnDictionary"; } - [[noreturn]] MutableColumnPtr clone_resized(size_t size) const override { - LOG(FATAL) << "clone_resized not supported in ColumnDictionary"; + MutableColumnPtr clone_resized(size_t size) const override { Review Comment: warning: function 'clone_resized' 
should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] MutableColumnPtr clone_resized(size_t size) const override { ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -858,28 +905,318 @@ bool ColumnObject::is_finalized() const { [](const auto& entry) { return entry->data.is_finalized(); }); } -void ColumnObject::finalize() { +static bool check_if_valid_column_name(const PathInData& path) { + static const std::regex COLUMN_NAME_REGEX("^[_a-zA-Z@0-9][.a-zA-Z0-9_+-/><?@#$%^&*]{0,255}$"); + return std::regex_match(path.get_path(), COLUMN_NAME_REGEX); +} + +void ColumnObject::Subcolumn::wrapp_array_nullable() { + // Wrap array with nullable, treat empty array as null to elimate conflict at present + auto& result_column = get_finalized_column_ptr(); + if (result_column->is_column_array() && !result_column->is_nullable()) { + auto new_null_map = ColumnUInt8::create(); + new_null_map->reserve(result_column->size()); + auto& null_map_data = new_null_map->get_data(); + auto array = static_cast<const ColumnArray*>(result_column.get()); + for (size_t i = 0; i < array->size(); ++i) { + null_map_data.push_back(array->is_default_at(i)); + } + result_column = ColumnNullable::create(std::move(result_column), std::move(new_null_map)); + data_types[0] = make_nullable(data_types[0]); + least_common_type = LeastCommonType {data_types[0]}; + } +} + +rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInData& path, + int idx = 0) { + if (idx >= path.get_parts().size()) { + return &json; + } + + std::string_view current_key = path.get_parts()[idx].key; + if (!json.IsObject()) { + return nullptr; + } + rapidjson::Value name(current_key.data(), current_key.size()); + auto it = json.FindMember(name); + if (it == json.MemberEnd()) { + return nullptr; + } + rapidjson::Value& current = it->value; + // if (idx == path.get_parts().size() - 1) { + // return &current; + // } + return find_leaf_node_by_path(current, path, idx + 1); +} + 
+void find_and_set_leave_value(const IColumn* column, const PathInData& path, + const DataTypeSerDeSPtr& type, rapidjson::Value& root, + rapidjson::Document::AllocatorType& allocator, int row) { + const auto* nullable = assert_cast<const ColumnNullable*>(column); + if (nullable->is_null_at(row)) { + return; + } + // TODO could cache the result of leaf nodes with it's path info + rapidjson::Value* target = find_leaf_node_by_path(root, path); + if (UNLIKELY(!target)) { + rapidjson::StringBuffer buffer; + rapidjson::Writer<rapidjson::StringBuffer> writer(buffer); + root.Accept(writer); + LOG(FATAL) << "could not find path " << path.get_path() + << ", root: " << std::string(buffer.GetString(), buffer.GetSize()); + } + type->write_one_cell_to_json(*column, *target, allocator, row); +} + +// compact null values +// {"a" : {"b" : "d" {"n" : null}, "e" : null}, "c" : 10 } +// after compact -> {"a" : {"c"} : 10} +void compact_null_values(rapidjson::Value& json, rapidjson::Document::AllocatorType& allocator) { + if (!json.IsObject() || json.IsNull()) { + return; + } + + rapidjson::Value::MemberIterator it = json.MemberBegin(); + while (it != json.MemberEnd()) { + rapidjson::Value& value = it->value; + if (value.IsNull()) { + it = json.EraseMember(it); + continue; + } + compact_null_values(value, allocator); + if (value.IsObject() && value.ObjectEmpty()) { + it = json.EraseMember(it); + continue; + } + ++it; + } +} + +// Construct rapidjson value from Subcolumns +void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::AllocatorType& allocator, + const ColumnObject::Subcolumns::Node* node_root) { + if (node_root == nullptr || node_root->children.empty()) { + root.SetNull(); + return; + } + root.SetObject(); + for (auto it = node_root->children.begin(); it != node_root->children.end(); ++it) { + auto child = it->get_second(); + rapidjson::Value value(rapidjson::kObjectType); + get_json_by_column_tree(value, allocator, child.get()); + 
root.AddMember(rapidjson::StringRef(it->get_first().data, it->get_first().size), value, + allocator); + } +} + +bool ColumnObject::serialize_one_row_to_string(int row, std::string* output) const { + if (!is_finalized()) { + const_cast<ColumnObject*>(this)->finalize(); + } + rapidjson::StringBuffer buf; + if (is_scalar_variant()) { + auto type = get_root_type(); + *output = type->to_string(*get_root(), row); + return true; + } + bool res = serialize_one_row_to_json_format(row, &buf, nullptr); + if (res) { + // TODO avoid copy + *output = std::string(buf.GetString(), buf.GetSize()); + } + return res; +} + +bool ColumnObject::serialize_one_row_to_string(int row, BufferWritable& output) const { + if (!is_finalized()) { + const_cast<ColumnObject*>(this)->finalize(); + } + if (is_scalar_variant()) { + auto type = get_root_type(); + type->to_string(*get_root(), row, output); + return true; + } + rapidjson::StringBuffer buf; + bool res = serialize_one_row_to_json_format(row, &buf, nullptr); + if (res) { + output.write(buf.GetString(), buf.GetLength()); + } + return res; +} + +bool ColumnObject::serialize_one_row_to_json_format(int row, rapidjson::StringBuffer* output, + bool* is_null) const { + CHECK(is_finalized()); + if (subcolumns.empty()) { + if (is_null != nullptr) { + *is_null = true; + } else { + rapidjson::Value root(rapidjson::kNullType); + rapidjson::Writer<rapidjson::StringBuffer> writer(*output); + return root.Accept(writer); + } + return true; + } + CHECK(size() > row); + rapidjson::StringBuffer buffer; + rapidjson::Value root(rapidjson::kNullType); + if (doc_structure == nullptr) { + doc_structure = std::make_shared<rapidjson::Document>(); + rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); + get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + } + if (!doc_structure->IsNull()) { + root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + } +#ifndef NDEBUG + VLOG_DEBUG << "dump structure " << 
JsonFunctions::print_json_value(*doc_structure); +#endif + for (const auto& subcolumn : subcolumns) { + find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(), subcolumn->path, + subcolumn->data.get_least_common_type_serde(), root, + doc_structure->GetAllocator(), row); + } + compact_null_values(root, doc_structure->GetAllocator()); + if (root.IsNull() && is_null != nullptr) { + // Fast path + *is_null = true; + } else { + output->Clear(); + rapidjson::Writer<rapidjson::StringBuffer> writer(*output); + return root.Accept(writer); + } + return true; +} + +void ColumnObject::merge_sparse_to_root_column() { Review Comment: warning: method 'merge_sparse_to_root_column' can be made const [readability-make-member-function-const] be/src/vec/columns/column_object.h:254: ```diff - void merge_sparse_to_root_column(); + void merge_sparse_to_root_column() const; ``` ```suggestion void ColumnObject::merge_sparse_to_root_column() const { ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -939,10 +1320,85 @@ num_rows = target_num_rows; } +void ColumnObject::create_root() { + auto type = is_nullable ? 
make_nullable(std::make_shared<MostCommonType>()) + : std::make_shared<MostCommonType>(); + add_sub_column({}, type->create_column(), type); +} + +void ColumnObject::create_root(const DataTypePtr& type, MutableColumnPtr&& column) { + if (num_rows == 0) { + num_rows = column->size(); + } + add_sub_column({}, std::move(column), type); +} + +bool ColumnObject::is_null_root() const { + auto* root = subcolumns.get_root(); + if (root == nullptr) { + return true; + } + if (root->data.num_of_defaults_in_prefix == 0 && + (root->data.data.empty() || is_nothing(root->data.get_least_common_type()))) { + return true; + } + return false; +} + +bool ColumnObject::is_scalar_variant() const { + // Only root itself + return !is_null_root() && subcolumns.get_leaves().size() == 1; +} + +DataTypePtr ColumnObject::get_root_type() const { + return subcolumns.get_root()->data.get_least_common_type(); +} + +#define SANITIZE_ROOT() \ + if (is_null_root()) { \ + return Status::InternalError("No root column, path {}", path.get_path()); \ + } \ + if (!WhichDataType(remove_nullable(subcolumns.get_root()->data.get_least_common_type())) \ + .is_json()) { \ + return Status::InternalError( \ + "Root column is not jsonb type but {}, path {}", \ + subcolumns.get_root()->data.get_least_common_type()->get_name(), path.get_path()); \ + } + Review Comment: warning: method 'extract_root' can be made const [readability-make-member-function-const] be/src/vec/columns/column_object.h:457: ```diff - Status extract_root(const PathInData& path); + Status extract_root(const PathInData& path) const; ``` ```suggestion Status ColumnObject::extract_root(const PathInData& path) const { ``` ########## be/src/vec/columns/column_object.h: ########## @@ -126,12 +141,12 @@ /// creates a single column that stores all values. void finalize(); + bool check_if_sparse_column(size_t num_rows); + /// Returns last inserted field. 
Field get_last_field() const; - /// Recreates subcolumn with default scalar values and keeps sizes of arrays. - /// Used to create columns of type Nested with consistent array sizes. - Subcolumn recreate_with_default_values(const FieldInfo& field_info) const; + FieldInfo get_subcolumn_field_info() const; Review Comment: warning: function 'get_subcolumn_field_info' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] FieldInfo get_subcolumn_field_info() const; ``` ########## be/src/vec/columns/column_object.h: ########## @@ -188,23 +215,68 @@ const bool is_nullable; Subcolumns subcolumns; size_t num_rows; + // sparse columns will be merge and encoded into root column + Subcolumns sparse_columns; + // The rapidjson document format of Subcolumns tree structure + // the leaves is null.In order to display whole document, copy + // this structure and fill with Subcolumns sub items + mutable std::shared_ptr<rapidjson::Document> doc_structure; public: static constexpr auto COLUMN_NAME_DUMMY = "_dummy"; - explicit ColumnObject(bool is_nullable_); + explicit ColumnObject(bool is_nullable_, bool create_root = true); ColumnObject(Subcolumns&& subcolumns_, bool is_nullable_); ~ColumnObject() override = default; - bool can_be_inside_nullable() const override { return true; } + bool can_be_inside_nullable() const override { return false; } Review Comment: warning: function 'can_be_inside_nullable' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] bool can_be_inside_nullable() const override { return false; } ``` ########## be/src/vec/columns/column_object.h: ########## @@ -154,18 +173,22 @@ const DataTypePtr& get() const { return type; } - const DataTypePtr& getBase() const { return base_type; } + const DataTypePtr& get_base() const { return base_type; } size_t get_dimensions() const { return num_dimensions; } void remove_nullable() { type = doris::vectorized::remove_nullable(type); } + const DataTypeSerDeSPtr& 
get_serde() const { return least_common_type_serder; } Review Comment: warning: function 'get_serde' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] const DataTypeSerDeSPtr& get_serde() const { return least_common_type_serder; } ``` ########## be/src/vec/columns/column.h: ########## @@ -654,6 +660,18 @@ return 0; } + /// Returns ratio of values in column, that are equal to default value of column. + /// Checks only @sample_ratio ratio of rows. + virtual double get_ratio_of_default_rows(double sample_ratio = 1.0) const { + LOG(FATAL) << fmt::format("get_ratio_of_default_rows of column {} are not implemented.", + get_name()); + return 0.0; + } + + /// Template is to devirtualize calls to 'isDefaultAt' method. + template <typename Derived> + double get_ratio_of_default_rows_impl(double sample_ratio) const; Review Comment: warning: function 'get_ratio_of_default_rows_impl' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] double get_ratio_of_default_rows_impl(double sample_ratio) const; ``` ########## be/src/vec/columns/column_impl.h: ########## @@ -86,4 +86,33 @@ void IColumn::get_indices_of_non_default_rows_impl(IColumn::Offsets64& indices, } } +template <typename Derived> +double IColumn::get_ratio_of_default_rows_impl(double sample_ratio) const { + if (sample_ratio <= 0.0 || sample_ratio > 1.0) { + LOG(FATAL) << "Value of 'sample_ratio' must be in interval (0.0; 1.0], but got: " + << sample_ratio; + } + static constexpr auto max_number_of_rows_for_full_search = 1000; + size_t num_rows = size(); + size_t num_sampled_rows = std::min(static_cast<size_t>(num_rows * sample_ratio), num_rows); + size_t num_checked_rows = 0; + size_t res = 0; + if (num_sampled_rows == num_rows || num_rows <= max_number_of_rows_for_full_search) { + for (size_t i = 0; i < num_rows; ++i) + res += static_cast<const Derived&>(*this).is_default_at(i); Review Comment: warning: statement should be inside braces 
[readability-braces-around-statements] ```suggestion for (size_t i = 0; i < num_rows; ++i) { res += static_cast<const Derived&>(*this).is_default_at(i); } ``` ########## be/src/vec/columns/column_const.h: ########## @@ -130,6 +128,8 @@ class ColumnConst final : public COWHelper<IColumn, ColumnConst> { void pop_back(size_t n) override { s -= n; } + bool can_be_inside_nullable() const override { return true; } Review Comment: warning: function 'can_be_inside_nullable' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] bool can_be_inside_nullable() const override { return true; } ``` ########## be/src/vec/columns/column.h: ########## @@ -654,6 +660,18 @@ return 0; } + /// Returns ratio of values in column, that are equal to default value of column. + /// Checks only @sample_ratio ratio of rows. + virtual double get_ratio_of_default_rows(double sample_ratio = 1.0) const { Review Comment: warning: function 'get_ratio_of_default_rows' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] virtual double get_ratio_of_default_rows(double sample_ratio = 1.0) const { ``` ########## be/src/vec/columns/column_nullable.h: ########## @@ -366,6 +365,10 @@ return get_ptr(); } + double get_ratio_of_default_rows(double sample_ratio) const override { Review Comment: warning: function 'get_ratio_of_default_rows' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] double get_ratio_of_default_rows(double sample_ratio) const override { ``` ########## be/src/vec/columns/column_nullable.h: ########## @@ -102,6 +102,7 @@ class ColumnNullable final : public COWHelper<IColumn, ColumnNullable> { bool is_null_at(size_t n) const override { return assert_cast<const ColumnUInt8&>(*null_map).get_data()[n] != 0; } + bool is_default_at(size_t n) const override { return is_null_at(n); } Review Comment: warning: function 'is_default_at' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion 
[[nodiscard]] bool is_default_at(size_t n) const override { return is_null_at(n); } ``` ########## be/src/vec/columns/column_nothing.h: ########## @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/ColumnNothing.h +// and modified by Doris + +#pragma once + +#include "vec/columns/column_dummy.h" + +namespace doris::vectorized { + +class ColumnNothing final : public COWHelper<IColumnDummy, ColumnNothing> { +private: + friend class COWHelper<IColumnDummy, ColumnNothing>; + + ColumnNothing(size_t s_) { s = s_; } + + ColumnNothing(const ColumnNothing&) = default; + +public: + const char* get_family_name() const override { return "Nothing"; } + MutableColumnPtr clone_dummy(size_t s_) const override { return ColumnNothing::create(s_); } Review Comment: warning: function 'clone_dummy' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] MutableColumnPtr clone_dummy(size_t s_) const override { return ColumnNothing::create(s_); } ``` ########## be/src/vec/columns/column_nothing.h: ########## @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// 
or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/ColumnNothing.h +// and modified by Doris + +#pragma once + +#include "vec/columns/column_dummy.h" + +namespace doris::vectorized { + +class ColumnNothing final : public COWHelper<IColumnDummy, ColumnNothing> { +private: + friend class COWHelper<IColumnDummy, ColumnNothing>; + + ColumnNothing(size_t s_) { s = s_; } + + ColumnNothing(const ColumnNothing&) = default; + +public: + const char* get_family_name() const override { return "Nothing"; } + MutableColumnPtr clone_dummy(size_t s_) const override { return ColumnNothing::create(s_); } + + bool can_be_inside_nullable() const override { return true; } + + bool structure_equals(const IColumn& rhs) const override { Review Comment: warning: function 'structure_equals' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] bool structure_equals(const IColumn& rhs) const override { ``` ########## be/src/vec/columns/column_nothing.h: ########## @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/ColumnNothing.h +// and modified by Doris + +#pragma once + +#include "vec/columns/column_dummy.h" + +namespace doris::vectorized { + +class ColumnNothing final : public COWHelper<IColumnDummy, ColumnNothing> { +private: + friend class COWHelper<IColumnDummy, ColumnNothing>; + + ColumnNothing(size_t s_) { s = s_; } + + ColumnNothing(const ColumnNothing&) = default; + +public: + const char* get_family_name() const override { return "Nothing"; } Review Comment: warning: function 'get_family_name' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] const char* get_family_name() const override { return "Nothing"; } ``` ########## be/src/vec/columns/column_nothing.h: ########## @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/AggregateFunctions/ColumnNothing.h +// and modified by Doris + +#pragma once + +#include "vec/columns/column_dummy.h" + +namespace doris::vectorized { + +class ColumnNothing final : public COWHelper<IColumnDummy, ColumnNothing> { +private: + friend class COWHelper<IColumnDummy, ColumnNothing>; + + ColumnNothing(size_t s_) { s = s_; } + + ColumnNothing(const ColumnNothing&) = default; + +public: + const char* get_family_name() const override { return "Nothing"; } + MutableColumnPtr clone_dummy(size_t s_) const override { return ColumnNothing::create(s_); } + + bool can_be_inside_nullable() const override { return true; } Review Comment: warning: function 'can_be_inside_nullable' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] bool can_be_inside_nullable() const override { return true; } ``` ########## be/src/vec/columns/column_nullable.h: ########## @@ -391,6 +394,8 @@ ColumnPtr index(const IColumn& indexes, size_t limit) const override; + bool is_predicate_column() const override { return nested_column->is_predicate_column(); } Review Comment: warning: function 'is_predicate_column' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] bool is_predicate_column() const override { return nested_column->is_predicate_column(); } ``` ########## be/src/vec/columns/column_object.h: ########## @@ -20,6 +20,8 @@ #pragma once #include <glog/logging.h> Review Comment: warning: 'glog/logging.h' file not 
found [clang-diagnostic-error] ```cpp #include <glog/logging.h> ^ ``` ########## be/src/vec/columns/column_object.h: ########## @@ -100,7 +109,13 @@ const DataTypePtr& get_least_common_type() const { return least_common_type.get(); } - const DataTypePtr& get_least_common_typeBase() const { return least_common_type.getBase(); } + const DataTypePtr& get_least_common_typeBase() const { Review Comment: warning: function 'get_least_common_typeBase' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] const DataTypePtr& get_least_common_typeBase() const { ``` ########## be/src/vec/columns/column_object.h: ########## @@ -100,7 +109,13 @@ const DataTypePtr& get_least_common_type() const { return least_common_type.get(); } - const DataTypePtr& get_least_common_typeBase() const { return least_common_type.getBase(); } + const DataTypePtr& get_least_common_typeBase() const { + return least_common_type.get_base(); + } + + const DataTypeSerDeSPtr& get_least_common_type_serde() const { Review Comment: warning: function 'get_least_common_type_serde' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] const DataTypeSerDeSPtr& get_least_common_type_serde() const { ``` ########## be/src/vec/columns/column_object.h: ########## @@ -154,18 +173,22 @@ const DataTypePtr& get() const { return type; } - const DataTypePtr& getBase() const { return base_type; } + const DataTypePtr& get_base() const { return base_type; } Review Comment: warning: function 'get_base' should be marked [[nodiscard]] [modernize-use-nodiscard] ```suggestion [[nodiscard]] const DataTypePtr& get_base() const { return base_type; } ``` ########## be/src/vec/columns/column_object.cpp: ########## @@ -858,28 +905,318 @@ [](const auto& entry) { return entry->data.is_finalized(); }); } -void ColumnObject::finalize() { +static bool check_if_valid_column_name(const PathInData& path) { + static const std::regex 
COLUMN_NAME_REGEX("^[_a-zA-Z@0-9][.a-zA-Z0-9_+-/><?@#$%^&*]{0,255}$"); + return std::regex_match(path.get_path(), COLUMN_NAME_REGEX); +} + +void ColumnObject::Subcolumn::wrapp_array_nullable() { + // Wrap array with nullable, treat empty array as null to elimate conflict at present + auto& result_column = get_finalized_column_ptr(); + if (result_column->is_column_array() && !result_column->is_nullable()) { + auto new_null_map = ColumnUInt8::create(); + new_null_map->reserve(result_column->size()); + auto& null_map_data = new_null_map->get_data(); + auto array = static_cast<const ColumnArray*>(result_column.get()); + for (size_t i = 0; i < array->size(); ++i) { + null_map_data.push_back(array->is_default_at(i)); + } + result_column = ColumnNullable::create(std::move(result_column), std::move(new_null_map)); + data_types[0] = make_nullable(data_types[0]); + least_common_type = LeastCommonType {data_types[0]}; + } +} + +rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInData& path, + int idx = 0) { + if (idx >= path.get_parts().size()) { + return &json; + } + + std::string_view current_key = path.get_parts()[idx].key; + if (!json.IsObject()) { + return nullptr; + } + rapidjson::Value name(current_key.data(), current_key.size()); + auto it = json.FindMember(name); + if (it == json.MemberEnd()) { + return nullptr; + } + rapidjson::Value& current = it->value; + // if (idx == path.get_parts().size() - 1) { + // return &current; + // } + return find_leaf_node_by_path(current, path, idx + 1); +} + +void find_and_set_leave_value(const IColumn* column, const PathInData& path, + const DataTypeSerDeSPtr& type, rapidjson::Value& root, + rapidjson::Document::AllocatorType& allocator, int row) { + const auto* nullable = assert_cast<const ColumnNullable*>(column); + if (nullable->is_null_at(row)) { + return; + } + // TODO could cache the result of leaf nodes with it's path info + rapidjson::Value* target = find_leaf_node_by_path(root, path); + if 
(UNLIKELY(!target)) { + rapidjson::StringBuffer buffer; + rapidjson::Writer<rapidjson::StringBuffer> writer(buffer); + root.Accept(writer); + LOG(FATAL) << "could not find path " << path.get_path() + << ", root: " << std::string(buffer.GetString(), buffer.GetSize()); + } + type->write_one_cell_to_json(*column, *target, allocator, row); +} + +// compact null values +// {"a" : {"b" : "d" {"n" : null}, "e" : null}, "c" : 10 } +// after compact -> {"a" : {"c"} : 10} +void compact_null_values(rapidjson::Value& json, rapidjson::Document::AllocatorType& allocator) { + if (!json.IsObject() || json.IsNull()) { + return; + } + + rapidjson::Value::MemberIterator it = json.MemberBegin(); + while (it != json.MemberEnd()) { + rapidjson::Value& value = it->value; + if (value.IsNull()) { + it = json.EraseMember(it); + continue; + } + compact_null_values(value, allocator); + if (value.IsObject() && value.ObjectEmpty()) { + it = json.EraseMember(it); + continue; + } + ++it; + } +} + +// Construct rapidjson value from Subcolumns +void get_json_by_column_tree(rapidjson::Value& root, rapidjson::Document::AllocatorType& allocator, + const ColumnObject::Subcolumns::Node* node_root) { + if (node_root == nullptr || node_root->children.empty()) { + root.SetNull(); + return; + } + root.SetObject(); + for (auto it = node_root->children.begin(); it != node_root->children.end(); ++it) { + auto child = it->get_second(); + rapidjson::Value value(rapidjson::kObjectType); + get_json_by_column_tree(value, allocator, child.get()); + root.AddMember(rapidjson::StringRef(it->get_first().data, it->get_first().size), value, + allocator); + } +} + +bool ColumnObject::serialize_one_row_to_string(int row, std::string* output) const { + if (!is_finalized()) { + const_cast<ColumnObject*>(this)->finalize(); + } + rapidjson::StringBuffer buf; + if (is_scalar_variant()) { + auto type = get_root_type(); + *output = type->to_string(*get_root(), row); + return true; + } + bool res = 
serialize_one_row_to_json_format(row, &buf, nullptr); + if (res) { + // TODO avoid copy + *output = std::string(buf.GetString(), buf.GetSize()); + } + return res; +} + +bool ColumnObject::serialize_one_row_to_string(int row, BufferWritable& output) const { + if (!is_finalized()) { + const_cast<ColumnObject*>(this)->finalize(); + } + if (is_scalar_variant()) { + auto type = get_root_type(); + type->to_string(*get_root(), row, output); + return true; + } + rapidjson::StringBuffer buf; + bool res = serialize_one_row_to_json_format(row, &buf, nullptr); + if (res) { + output.write(buf.GetString(), buf.GetLength()); + } + return res; +} + +bool ColumnObject::serialize_one_row_to_json_format(int row, rapidjson::StringBuffer* output, + bool* is_null) const { + CHECK(is_finalized()); + if (subcolumns.empty()) { + if (is_null != nullptr) { + *is_null = true; + } else { + rapidjson::Value root(rapidjson::kNullType); + rapidjson::Writer<rapidjson::StringBuffer> writer(*output); + return root.Accept(writer); + } + return true; + } + CHECK(size() > row); + rapidjson::StringBuffer buffer; + rapidjson::Value root(rapidjson::kNullType); + if (doc_structure == nullptr) { + doc_structure = std::make_shared<rapidjson::Document>(); + rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); + get_json_by_column_tree(*doc_structure, allocator, subcolumns.get_root()); + } + if (!doc_structure->IsNull()) { + root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + } +#ifndef NDEBUG + VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); +#endif + for (const auto& subcolumn : subcolumns) { + find_and_set_leave_value(subcolumn->data.get_finalized_column_ptr(), subcolumn->path, + subcolumn->data.get_least_common_type_serde(), root, + doc_structure->GetAllocator(), row); + } + compact_null_values(root, doc_structure->GetAllocator()); + if (root.IsNull() && is_null != nullptr) { + // Fast path + *is_null = true; + } else { + 
output->Clear(); + rapidjson::Writer<rapidjson::StringBuffer> writer(*output); + return root.Accept(writer); + } + return true; +} + +void ColumnObject::merge_sparse_to_root_column() { + CHECK(is_finalized()); + if (sparse_columns.empty()) { + return; + } + ColumnPtr src = subcolumns.get_mutable_root()->data.get_finalized_column_ptr(); + MutableColumnPtr mresult = src->clone_empty(); + const ColumnNullable* src_null = assert_cast<const ColumnNullable*>(src.get()); + const ColumnString* src_column_ptr = + assert_cast<const ColumnString*>(&src_null->get_nested_column()); + rapidjson::StringBuffer buffer; + doc_structure = std::make_shared<rapidjson::Document>(); + rapidjson::Document::AllocatorType& allocator = doc_structure->GetAllocator(); + get_json_by_column_tree(*doc_structure, allocator, sparse_columns.get_root()); + +#ifndef NDEBUG + VLOG_DEBUG << "dump structure " << JsonFunctions::print_json_value(*doc_structure); +#endif + + ColumnNullable* result_column_nullable = + assert_cast<ColumnNullable*>(mresult->assume_mutable().get()); + ColumnString* result_column_ptr = + assert_cast<ColumnString*>(&result_column_nullable->get_nested_column()); + result_column_nullable->reserve(num_rows); + // parse each row to jsonb + for (size_t i = 0; i < num_rows; ++i) { + // root is not null, store original value, eg. 
the root is scalar type like '[1]' + if (!src_null->empty() && !src_null->is_null_at(i)) { + result_column_ptr->insert_data(src_column_ptr->get_data_at(i).data, + src_column_ptr->get_data_at(i).size); + result_column_nullable->get_null_map_data().push_back(0); + continue; + } + + // parse and encode sparse columns + buffer.Clear(); + rapidjson::Value root(rapidjson::kNullType); + if (!doc_structure->IsNull()) { + root.CopyFrom(*doc_structure, doc_structure->GetAllocator()); + } + size_t null_count = 0; + for (const auto& subcolumn : sparse_columns) { + auto& column = subcolumn->data.get_finalized_column_ptr(); + if (assert_cast<const ColumnNullable&>(*column).is_null_at(i)) { + ++null_count; + continue; + } + find_and_set_leave_value(column, subcolumn->path, + subcolumn->data.get_least_common_type_serde(), root, + doc_structure->GetAllocator(), i); + } + + // all null values, store null to sparse root + if (null_count == sparse_columns.size()) { + result_column_ptr->insert_default(); + result_column_nullable->get_null_map_data().push_back(1); + continue; + } + + // encode sparse columns into jsonb format + compact_null_values(root, doc_structure->GetAllocator()); + // parse as jsonb value and put back to rootnode + // TODO, we could convert to jsonb directly from rapidjson::Value for better performance, instead of parsing + JsonbParser parser; + rapidjson::Writer<rapidjson::StringBuffer> writer(buffer); + root.Accept(writer); + bool res = parser.parse(buffer.GetString(), buffer.GetSize()); + CHECK(res) << "buffer:" << std::string(buffer.GetString(), buffer.GetSize()) + << ", row_num:" << i; + result_column_ptr->insert_data(parser.getWriter().getOutput()->getBuffer(), + parser.getWriter().getOutput()->getSize()); + result_column_nullable->get_null_map_data().push_back(0); + } + + // assign merged column + subcolumns.get_mutable_root()->data.get_finalized_column_ptr() = mresult->get_ptr(); +} + +void ColumnObject::finalize(bool ignore_sparse) { Review Comment: 
warning: method 'finalize' can be made const [readability-make-member-function-const] be/src/vec/columns/column_object.h:328: ```diff - void finalize(bool ignore_sparse); + void finalize(bool ignore_sparse) const; ``` ```suggestion void ColumnObject::finalize(bool ignore_sparse) const { ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org