xiaokang commented on code in PR #39022: URL: https://github.com/apache/doris/pull/39022#discussion_r1721964067
########## be/src/vec/json/path_in_data.cpp: ########## @@ -170,6 +178,33 @@ PathInData PathInData::copy_pop_front() const { return copy_pop_nfront(1); } +PathInData PathInData::get_nested_prefix_path() const { + CHECK(has_nested_part()); + PathInData new_path; + Parts new_parts; + for (const Part& part : parts) { + new_parts.push_back(part); + if (part.is_nested) { + break; + } + } + new_path.build_path(new_parts); Review Comment: build_path and build_parts can be combined since they are usually used together and build_parts relies on path. ########## be/src/vec/json/json_parser.cpp: ########## @@ -71,14 +71,15 @@ bool JSONDataParser<ParserImpl, parse_nested>::extract_key(MutableColumns& colum } template <typename ParserImpl, bool parse_nested> -std::optional<ParseResult> JSONDataParser<ParserImpl, parse_nested>::parse(const char* begin, - size_t length) { +std::optional<ParseResult> JSONDataParser<ParserImpl, parse_nested>::parse( + const char* begin, size_t length, const ParseConfig& config) { std::string_view json {begin, length}; Element document; if (!parser.parse(json, document)) { Review Comment: unnecessary string_view wrapper, since underlying parser impl use data and size. ########## be/src/vec/columns/column_object.cpp: ########## @@ -884,8 +996,62 @@ void ColumnObject::get(size_t n, Field& res) const { auto& object = res.get<VariantMap&>(); for (const auto& entry : subcolumns) { - auto it = object.try_emplace(entry->path.get_path()).first; - entry->data.get(n, it->second); + Field field; + entry->data.get(n, field); + // Notice: we treat null as empty field + if (field.get_type() != Field::Types::Null) { + object.try_emplace(entry->path.get_path(), field); + } + } + if (object.empty()) { + res = Null(); + } +} + +void ColumnObject::add_nested_subcolumn(const PathInData& key, const FieldInfo& field_info, + size_t new_size) { + if (!key.has_nested_part()) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Cannot add Nested subcolumn, because path doesn't contain Nested"); + } + + bool inserted = false; + /// We find node that represents the same Nested type as @key. + const auto* nested_node = subcolumns.find_best_match(key); Review Comment: add comment TODO ########## be/src/vec/json/json_parser.cpp: ########## @@ -97,7 +98,7 @@ void JSONDataParser<ParserImpl, parse_nested>::traverse(const Element& element, } else if (element.isArray()) { has_nested = false; checkHasNested(element); Review Comment: checkHasNested -> check_has_nested_object ########## be/src/vec/columns/column_object.cpp: ########## @@ -1132,6 +1298,44 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat return find_leaf_node_by_path(current, path, idx + 1); } +// skip empty json: +// 1. null value as empty json +// 2. nested array with only nulls, eg. [null. null] Review Comment: add comment for real data [null, null] TODO ########## be/src/vec/columns/column_object.cpp: ########## @@ -1132,6 +1298,44 @@ rapidjson::Value* find_leaf_node_by_path(rapidjson::Value& json, const PathInDat return find_leaf_node_by_path(current, path, idx + 1); } +// skip empty json: +// 1. null value as empty json +// 2. nested array with only nulls, eg. [null. null] Review Comment: Can we distinguish real data [null, null] and filled data [null, null]. I think it's possible using the null_bitmap of ColumnNullable(ColumnArray). ########## be/src/vec/data_types/data_type_jsonb.h: ########## @@ -68,7 +68,7 @@ class DataTypeJsonb final : public IDataType { MutableColumnPtr create_column() const override; virtual Field get_default() const override { - std::string default_json = "{}"; + std::string default_json = "null"; Review Comment: We can test and follow MySQL's JSON default value. ########## be/src/vec/columns/column_object.cpp: ########## @@ -384,9 +425,9 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info) { } if (data.empty()) { add_new_column_part(create_array_of_type(base_type.idx, value_dim, is_nullable)); - } else if (least_common_type.get_type_id() != base_type.idx && !base_type.is_nothing()) { - if (schema_util::is_conversion_required_between_integers(base_type.idx, - least_common_type.get_type_id())) { + } else if (least_common_type.get_base_type_id() != base_type.idx && !base_type.is_nothing()) { Review Comment: add comment for base_type_id and type_id ########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -1663,4 +1663,58 @@ Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const si return Status::OK(); } +Status DefaultNestedColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst) { + bool has_null = false; + return next_batch(n, dst, &has_null); +} + +static void fill_nested_with_defaults(vectorized::MutableColumnPtr& dst, + vectorized::MutableColumnPtr& sibling_column, size_t nrows) { + const auto* sibling_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + remove_nullable(sibling_column->get_ptr())); + CHECK(sibling_array) << "Expected array column, but mmet " << sibling_column->get_name(); + ; + const auto* dst_array = vectorized::check_and_get_column<vectorized::ColumnArray>( + remove_nullable(dst->get_ptr())); + if (!dst_array || !sibling_array) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Expected array column, but met %s and %s", dst->get_name(), + sibling_column->get_name()); + } + auto new_nested = + dst_array->get_data_ptr()->clone_resized(sibling_array->get_data_ptr()->size()); + auto new_array = make_nullable(vectorized::ColumnArray::create( + new_nested->assume_mutable(), sibling_array->get_offsets_ptr()->assume_mutable())); + dst->insert_range_from(*new_array, 0, new_array->size()); Review Comment: OK. Can you just set dst to new_array to avoid data copy in `insert_range_from` ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org