xiaokang commented on code in PR #28673: URL: https://github.com/apache/doris/pull/28673#discussion_r1470534003
########## gensrc/proto/olap_file.proto: ########## @@ -343,6 +343,7 @@ message TabletSchemaPB { optional bool enable_single_replica_compaction = 22 [default=false]; optional bool skip_write_index_on_load = 23 [default=false]; repeated int32 cluster_key_idxes = 24; + repeated ColumnPB sparse_column = 25; // sparse column within a variant column Review Comment: There are some problems using a sperated field: 1. the mapping to variant columns should change on add/drop a variant column 2. add more chance for meta conflict ########## be/src/vec/common/schema_util.cpp: ########## @@ -484,6 +548,92 @@ void encode_variant_sparse_subcolumns(Block& block, const std::vector<int>& vari } } +void _append_column(const TabletColumn& parent_variant, + const ColumnObject::Subcolumns::NodePtr& subcolumn, TabletSchemaSPtr& to_append, + bool is_sparse) { + // If column already exist in original tablet schema, then we pick common type + // and cast column to common type, and modify tablet column to common type, + // otherwise it's a new column + const std::string& column_name = + parent_variant.name_lower_case() + "." + subcolumn->path.get_path(); + const vectorized::DataTypePtr& final_data_type_from_object = + subcolumn->data.get_least_common_type(); + vectorized::PathInDataBuilder full_path_builder; + auto full_path = full_path_builder.append(parent_variant.name_lower_case(), false) + .append(subcolumn->path.get_parts(), false) + .build(); + TabletColumn tablet_column = vectorized::schema_util::get_column_by_type( + final_data_type_from_object, column_name, + vectorized::schema_util::ExtraInfo {.unique_id = -1, + .parent_unique_id = parent_variant.unique_id(), + .path_info = full_path}); + + if (!is_sparse) { + to_append->append_column(std::move(tablet_column)); + } else { + to_append->append_sparse_column(std::move(tablet_column)); + } +} + +void rebuild_schema_and_block(const TabletSchemaSPtr& original, + const std::vector<int>& variant_positions, Block& flush_block, + TabletSchemaSPtr& flush_schema) { + // rebuild schema and block with variant extracted columns + + // 1. Flatten variant column into flat columns, append flatten columns to the back of original Block and TabletSchema + // those columns are extracted columns, leave none extracted columns remain in original variant column, which is + // JSONB format at present. + // 2. Collect columns that need to be added or modified when data type changes or new columns encountered + for (size_t variant_pos : variant_positions) { + auto column_ref = flush_block.get_by_position(variant_pos).column; + bool is_nullable = column_ref->is_nullable(); + const vectorized::ColumnObject& object_column = assert_cast<vectorized::ColumnObject&>( + remove_nullable(column_ref)->assume_mutable_ref()); + const TabletColumn& parent_column = original->columns()[variant_pos]; + CHECK(object_column.is_finalized()); + std::shared_ptr<vectorized::ColumnObject::Subcolumns::Node> root; + // common extracted columns + for (const auto& entry : object_column.get_subcolumns()) { + if (entry->path.empty()) { + // root + root = entry; + continue; + } + _append_column(parent_column, entry, flush_schema, false); + const std::string& column_name = + parent_column.name_lower_case() + "." + entry->path.get_path(); + flush_block.insert({entry->data.get_finalized_column_ptr()->get_ptr(), + entry->data.get_least_common_type(), column_name}); + } + + // add sparse columns to flush_schema + for (const auto& entry : object_column.get_sparse_subcolumns()) { + _append_column(parent_column, entry, flush_schema, true); Review Comment: no flush_block.insert() ? ########## be/src/vec/data_types/data_type_factory.cpp: ########## @@ -586,7 +586,7 @@ DataTypePtr DataTypeFactory::create_data_type(const segment_v2::ColumnMetaPB& pc DataTypePtr nested = nullptr; if (pcolumn.type() == static_cast<int>(FieldType::OLAP_FIELD_TYPE_ARRAY)) { // Item subcolumn and length subcolumn - DCHECK_GE(pcolumn.children_columns().size(), 2) << pcolumn.DebugString(); + DCHECK_GE(pcolumn.children_columns().size(), 1) << pcolumn.DebugString(); Review Comment: why change children columns from 2 to 1? ########## gensrc/proto/olap_file.proto: ########## @@ -343,6 +343,7 @@ message TabletSchemaPB { optional bool enable_single_replica_compaction = 22 [default=false]; optional bool skip_write_index_on_load = 23 [default=false]; repeated int32 cluster_key_idxes = 24; + repeated ColumnPB sparse_column = 25; // sparse column within a variant column Review Comment: Can you store it inside variant column? ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -472,31 +510,26 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, // then we could optimize to generate a default iterator // This file doest not contain this column, so only read from sparse column // to avoid read amplification Review Comment: add comment for each branch: leaf node, mid node with data, mid node without data ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -472,31 +510,26 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, // then we could optimize to generate a default iterator // This file doest not contain this column, so only read from sparse column // to avoid read amplification - if (node != nullptr && node->is_scalar() && node->children.empty()) { + if (node != nullptr && node->is_scalar() && node->children.empty() && sparse_node == nullptr) { Review Comment: wrap function is_leaf_node() ... ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -472,31 +510,26 @@ Status Segment::new_column_iterator_with_path(const TabletColumn& tablet_column, // then we could optimize to generate a default iterator // This file doest not contain this column, so only read from sparse column // to avoid read amplification - if (node != nullptr && node->is_scalar() && node->children.empty()) { + if (node != nullptr && node->is_scalar() && node->children.empty() && sparse_node == nullptr) { Review Comment: is_scalar() => has_data() -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org