Re: [PR] [fix](Variant Type) Add sparse columns meta to fix compaction [doris]

via GitHub Tue, 30 Jan 2024 00:50:52 -0800


xiaokang commented on code in PR #28673:
URL: https://github.com/apache/doris/pull/28673#discussion_r1470534003



##########
gensrc/proto/olap_file.proto:
##########
@@ -343,6 +343,7 @@ message TabletSchemaPB {
     optional bool enable_single_replica_compaction = 22 [default=false];
     optional bool skip_write_index_on_load = 23 [default=false];
     repeated int32 cluster_key_idxes = 24;
+    repeated ColumnPB sparse_column = 25;  // sparse column within a variant 
column

Review Comment:
   There are some problems using a sperated field:
   1. the mapping to variant columns should change on add/drop a variant column
   2. add more chance for meta conflict



##########
be/src/vec/common/schema_util.cpp:
##########
@@ -484,6 +548,92 @@ void encode_variant_sparse_subcolumns(Block& block, const 
std::vector<int>& vari
     }
 }
 
+void _append_column(const TabletColumn& parent_variant,
+                    const ColumnObject::Subcolumns::NodePtr& subcolumn, 
TabletSchemaSPtr& to_append,
+                    bool is_sparse) {
+    // If column already exist in original tablet schema, then we pick common 
type
+    // and cast column to common type, and modify tablet column to common type,
+    // otherwise it's a new column
+    const std::string& column_name =
+            parent_variant.name_lower_case() + "." + 
subcolumn->path.get_path();
+    const vectorized::DataTypePtr& final_data_type_from_object =
+            subcolumn->data.get_least_common_type();
+    vectorized::PathInDataBuilder full_path_builder;
+    auto full_path = 
full_path_builder.append(parent_variant.name_lower_case(), false)
+                             .append(subcolumn->path.get_parts(), false)
+                             .build();
+    TabletColumn tablet_column = vectorized::schema_util::get_column_by_type(
+            final_data_type_from_object, column_name,
+            vectorized::schema_util::ExtraInfo {.unique_id = -1,
+                                                .parent_unique_id = 
parent_variant.unique_id(),
+                                                .path_info = full_path});
+
+    if (!is_sparse) {
+        to_append->append_column(std::move(tablet_column));
+    } else {
+        to_append->append_sparse_column(std::move(tablet_column));
+    }
+}
+
+void rebuild_schema_and_block(const TabletSchemaSPtr& original,
+                              const std::vector<int>& variant_positions, 
Block& flush_block,
+                              TabletSchemaSPtr& flush_schema) {
+    // rebuild schema and block with variant extracted columns
+
+    // 1. Flatten variant column into flat columns, append flatten columns to 
the back of original Block and TabletSchema
+    // those columns are extracted columns, leave none extracted columns 
remain in original variant column, which is
+    // JSONB format at present.
+    // 2. Collect columns that need to be added or modified when data type 
changes or new columns encountered
+    for (size_t variant_pos : variant_positions) {
+        auto column_ref = flush_block.get_by_position(variant_pos).column;
+        bool is_nullable = column_ref->is_nullable();
+        const vectorized::ColumnObject& object_column = 
assert_cast<vectorized::ColumnObject&>(
+                remove_nullable(column_ref)->assume_mutable_ref());
+        const TabletColumn& parent_column = original->columns()[variant_pos];
+        CHECK(object_column.is_finalized());
+        std::shared_ptr<vectorized::ColumnObject::Subcolumns::Node> root;
+        // common extracted columns
+        for (const auto& entry : object_column.get_subcolumns()) {
+            if (entry->path.empty()) {
+                // root
+                root = entry;
+                continue;
+            }
+            _append_column(parent_column, entry, flush_schema, false);
+            const std::string& column_name =
+                    parent_column.name_lower_case() + "." + 
entry->path.get_path();
+            
flush_block.insert({entry->data.get_finalized_column_ptr()->get_ptr(),
+                                entry->data.get_least_common_type(), 
column_name});
+        }
+
+        // add sparse columns to flush_schema
+        for (const auto& entry : object_column.get_sparse_subcolumns()) {
+            _append_column(parent_column, entry, flush_schema, true);

Review Comment:
   no flush_block.insert() ?



##########
be/src/vec/data_types/data_type_factory.cpp:
##########
@@ -586,7 +586,7 @@ DataTypePtr DataTypeFactory::create_data_type(const 
segment_v2::ColumnMetaPB& pc
     DataTypePtr nested = nullptr;
     if (pcolumn.type() == static_cast<int>(FieldType::OLAP_FIELD_TYPE_ARRAY)) {
         // Item subcolumn and length subcolumn
-        DCHECK_GE(pcolumn.children_columns().size(), 2) << 
pcolumn.DebugString();
+        DCHECK_GE(pcolumn.children_columns().size(), 1) << 
pcolumn.DebugString();

Review Comment:
   why change children columns from 2 to 1?



##########
gensrc/proto/olap_file.proto:
##########
@@ -343,6 +343,7 @@ message TabletSchemaPB {
     optional bool enable_single_replica_compaction = 22 [default=false];
     optional bool skip_write_index_on_load = 23 [default=false];
     repeated int32 cluster_key_idxes = 24;
+    repeated ColumnPB sparse_column = 25;  // sparse column within a variant 
column

Review Comment:
   Can you store it inside variant column?



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -472,31 +510,26 @@ Status Segment::new_column_iterator_with_path(const 
TabletColumn& tablet_column,
     // then we could optimize to generate a default iterator
     // This file doest not contain this column, so only read from sparse column
     // to avoid read amplification

Review Comment:
   add comment for each branch: leaf node, mid node with data, mid node without 
data



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -472,31 +510,26 @@ Status Segment::new_column_iterator_with_path(const 
TabletColumn& tablet_column,
     // then we could optimize to generate a default iterator
     // This file doest not contain this column, so only read from sparse column
     // to avoid read amplification
-    if (node != nullptr && node->is_scalar() && node->children.empty()) {
+    if (node != nullptr && node->is_scalar() && node->children.empty() && 
sparse_node == nullptr) {

Review Comment:
   wrap function is_leaf_node() ...



##########
be/src/olap/rowset/segment_v2/segment.cpp:
##########
@@ -472,31 +510,26 @@ Status Segment::new_column_iterator_with_path(const 
TabletColumn& tablet_column,
     // then we could optimize to generate a default iterator
     // This file doest not contain this column, so only read from sparse column
     // to avoid read amplification
-    if (node != nullptr && node->is_scalar() && node->children.empty()) {
+    if (node != nullptr && node->is_scalar() && node->children.empty() && 
sparse_node == nullptr) {

Review Comment:
   is_scalar() => has_data()



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] [fix](Variant Type) Add sparse columns meta to fix compaction [doris]

Reply via email to