Re: [PR] [Feature-Variant](Variant Type) support variant type query and index [doris]

via GitHub Sun, 26 Nov 2023 17:57:35 -0800


xiaokang commented on code in PR #26749:
URL: https://github.com/apache/doris/pull/26749#discussion_r1405391529



##########
be/src/olap/rowset/segment_creator.cpp:
##########
@@ -280,6 +441,17 @@ Status SegmentCreator::add_block(const vectorized::Block* 
block) {
     size_t row_avg_size_in_bytes = std::max((size_t)1, block_size_in_bytes / 
block_row_num);
     size_t row_offset = 0;
 
+    if (_segment_flusher.need_buffering()) {
+        const static int MAX_BUFFER_SIZE = 
config::flushing_block_buffer_size_bytes; // 400M

Review Comment:
   can we reuse config::write_buffer_size, since the sematics is almost the 
same?



##########
be/src/olap/rowset/segment_creator.cpp:
##########
@@ -40,32 +49,178 @@ SegmentFlusher::SegmentFlusher() = default;
 
 SegmentFlusher::~SegmentFlusher() = default;
 
-Status SegmentFlusher::init(const RowsetWriterContext& rowset_writer_context) {
-    _context = rowset_writer_context;
+Status SegmentFlusher::init(RowsetWriterContext& rowset_writer_context) {
+    _context = &rowset_writer_context;
     return Status::OK();
 }
 
 Status SegmentFlusher::flush_single_block(const vectorized::Block* block, 
int32_t segment_id,
-                                          int64_t* flush_size, 
TabletSchemaSPtr flush_schema) {
+                                          int64_t* flush_size) {
     if (block->rows() == 0) {
         return Status::OK();
     }
-    bool no_compression = block->bytes() <= 
config::segment_compression_threshold_kb * 1024;
+    TabletSchemaSPtr flush_schema = nullptr;
+    // Expand variant columns
+    vectorized::Block flush_block(*block);
+    if (_context->write_type != DataWriteType::TYPE_COMPACTION &&
+        _context->tablet_schema->num_variant_columns() > 0) {
+        RETURN_IF_ERROR(_expand_variant_to_subcolumns(flush_block, 
flush_schema));
+    }
+    bool no_compression = flush_block.bytes() <= 
config::segment_compression_threshold_kb * 1024;
     if (config::enable_vertical_segment_writer &&
-        _context.tablet_schema->cluster_key_idxes().empty()) {
+        _context->tablet_schema->cluster_key_idxes().empty()) {
         std::unique_ptr<segment_v2::VerticalSegmentWriter> writer;
         RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, 
no_compression, flush_schema));
-        RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows()));
+        RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, 
flush_block.rows()));
         RETURN_IF_ERROR(_flush_segment_writer(writer, flush_size));
     } else {
         std::unique_ptr<segment_v2::SegmentWriter> writer;
         RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, 
no_compression, flush_schema));
-        RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows()));
+        RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, 
flush_block.rows()));
         RETURN_IF_ERROR(_flush_segment_writer(writer, flush_size));
     }
     return Status::OK();
 }
 
+Status SegmentFlusher::_expand_variant_to_subcolumns(vectorized::Block& block,
+                                                     TabletSchemaSPtr& 
flush_schema) {
+    size_t num_rows = block.rows();
+    if (num_rows == 0) {
+        return Status::OK();
+    }
+
+    std::vector<int> variant_column_pos;
+    if (_context->partial_update_info && 
_context->partial_update_info->is_partial_update) {
+        // check columns that used to do partial updates should not include 
variant
+        for (int i : _context->partial_update_info->update_cids) {
+            const auto& col = _context->tablet_schema->columns()[i];
+            if (!col.is_key() && col.name() != DELETE_SIGN) {
+                return Status::InvalidArgument(
+                        "Not implement partial update for variant only support 
delete currently");
+            }
+        }
+    } else {
+        for (int i = 0; i < _context->tablet_schema->columns().size(); ++i) {
+            if (_context->tablet_schema->columns()[i].is_variant_type()) {
+                variant_column_pos.push_back(i);
+            }
+        }
+    }
+
+    if (variant_column_pos.empty()) {
+        return Status::OK();
+    }
+
+    try {
+        // Parse each variant column from raw string column
+        vectorized::schema_util::parse_variant_columns(block, 
variant_column_pos);
+        vectorized::schema_util::finalize_variant_columns(block, 
variant_column_pos,
+                                                          false /*not ingore 
sparse*/);
+        vectorized::schema_util::encode_variant_sparse_subcolumns(block, 
variant_column_pos);
+    } catch (const doris::Exception& e) {
+        // TODO more graceful, max_filter_ratio
+        LOG(WARNING) << "encounter execption " << e.to_string();
+        return Status::InternalError(e.to_string());
+    }
+
+    // Dynamic Block consists of two parts, dynamic part of columns and static 
part of columns
+    //     static     extracted
+    // | --------- | ----------- |
+    // The static ones are original _tablet_schame columns
+    flush_schema = std::make_shared<TabletSchema>();
+    flush_schema->copy_from(*_context->original_tablet_schema);
+
+    vectorized::Block flush_block(std::move(block));
+    // If column already exist in original tablet schema, then we pick common 
type
+    // and cast column to common type, and modify tablet column to common type,
+    // otherwise it's a new column, we should add to frontend

Review Comment:
   Is 'add to frontend' the old comment for dynamic table?



##########
be/src/olap/rowset/segment_creator.cpp:
##########
@@ -40,32 +49,178 @@ SegmentFlusher::SegmentFlusher() = default;
 
 SegmentFlusher::~SegmentFlusher() = default;
 
-Status SegmentFlusher::init(const RowsetWriterContext& rowset_writer_context) {
-    _context = rowset_writer_context;
+Status SegmentFlusher::init(RowsetWriterContext& rowset_writer_context) {
+    _context = &rowset_writer_context;
     return Status::OK();
 }
 
 Status SegmentFlusher::flush_single_block(const vectorized::Block* block, 
int32_t segment_id,
-                                          int64_t* flush_size, 
TabletSchemaSPtr flush_schema) {
+                                          int64_t* flush_size) {
     if (block->rows() == 0) {
         return Status::OK();
     }
-    bool no_compression = block->bytes() <= 
config::segment_compression_threshold_kb * 1024;
+    TabletSchemaSPtr flush_schema = nullptr;
+    // Expand variant columns
+    vectorized::Block flush_block(*block);
+    if (_context->write_type != DataWriteType::TYPE_COMPACTION &&
+        _context->tablet_schema->num_variant_columns() > 0) {
+        RETURN_IF_ERROR(_expand_variant_to_subcolumns(flush_block, 
flush_schema));
+    }
+    bool no_compression = flush_block.bytes() <= 
config::segment_compression_threshold_kb * 1024;
     if (config::enable_vertical_segment_writer &&
-        _context.tablet_schema->cluster_key_idxes().empty()) {
+        _context->tablet_schema->cluster_key_idxes().empty()) {
         std::unique_ptr<segment_v2::VerticalSegmentWriter> writer;
         RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, 
no_compression, flush_schema));
-        RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows()));
+        RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, 
flush_block.rows()));
         RETURN_IF_ERROR(_flush_segment_writer(writer, flush_size));
     } else {
         std::unique_ptr<segment_v2::SegmentWriter> writer;
         RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, 
no_compression, flush_schema));
-        RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows()));
+        RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, 
flush_block.rows()));
         RETURN_IF_ERROR(_flush_segment_writer(writer, flush_size));
     }
     return Status::OK();
 }
 
+Status SegmentFlusher::_expand_variant_to_subcolumns(vectorized::Block& block,
+                                                     TabletSchemaSPtr& 
flush_schema) {
+    size_t num_rows = block.rows();
+    if (num_rows == 0) {
+        return Status::OK();
+    }
+
+    std::vector<int> variant_column_pos;
+    if (_context->partial_update_info && 
_context->partial_update_info->is_partial_update) {
+        // check columns that used to do partial updates should not include 
variant
+        for (int i : _context->partial_update_info->update_cids) {
+            const auto& col = _context->tablet_schema->columns()[i];
+            if (!col.is_key() && col.name() != DELETE_SIGN) {
+                return Status::InvalidArgument(
+                        "Not implement partial update for variant only support 
delete currently");
+            }
+        }
+    } else {
+        for (int i = 0; i < _context->tablet_schema->columns().size(); ++i) {
+            if (_context->tablet_schema->columns()[i].is_variant_type()) {
+                variant_column_pos.push_back(i);
+            }
+        }
+    }
+
+    if (variant_column_pos.empty()) {
+        return Status::OK();
+    }
+
+    try {
+        // Parse each variant column from raw string column
+        vectorized::schema_util::parse_variant_columns(block, 
variant_column_pos);
+        vectorized::schema_util::finalize_variant_columns(block, 
variant_column_pos,
+                                                          false /*not ingore 
sparse*/);
+        vectorized::schema_util::encode_variant_sparse_subcolumns(block, 
variant_column_pos);
+    } catch (const doris::Exception& e) {
+        // TODO more graceful, max_filter_ratio
+        LOG(WARNING) << "encounter execption " << e.to_string();
+        return Status::InternalError(e.to_string());
+    }
+
+    // Dynamic Block consists of two parts, dynamic part of columns and static 
part of columns
+    //     static     extracted
+    // | --------- | ----------- |
+    // The static ones are original _tablet_schame columns
+    flush_schema = std::make_shared<TabletSchema>();
+    flush_schema->copy_from(*_context->original_tablet_schema);
+
+    vectorized::Block flush_block(std::move(block));
+    // If column already exist in original tablet schema, then we pick common 
type
+    // and cast column to common type, and modify tablet column to common type,
+    // otherwise it's a new column, we should add to frontend
+    auto append_column = [&](const TabletColumn& parent_variant, auto& 
column_entry_from_object) {
+        const std::string& column_name =
+                parent_variant.name_lower_case() + "." + 
column_entry_from_object->path.get_path();
+        const vectorized::DataTypePtr& final_data_type_from_object =
+                column_entry_from_object->data.get_least_common_type();
+        TabletColumn tablet_column;
+        vectorized::PathInDataBuilder full_path_builder;
+        auto full_path = 
full_path_builder.append(parent_variant.name_lower_case(), false)
+                                 
.append(column_entry_from_object->path.get_parts(), false)
+                                 .build();
+        vectorized::schema_util::get_column_by_type(

Review Comment:
   get_column_by_type() just create a TabletColumn by type argument, so can we 
make it return a TabletColumn instead of using a ref argument.



##########
be/src/olap/tablet_schema.cpp:
##########
@@ -1040,9 +1180,12 @@ bool 
TabletSchema::has_inverted_index_with_index_id(int32_t index_id) const {
     return false;
 }
 
-const TabletIndex* TabletSchema::get_inverted_index(int32_t col_unique_id) 
const {
-    // TODO use more efficient impl
+const TabletIndex* TabletSchema::get_inverted_index(int32_t col_unique_id,
+                                                    const std::string& 
suffix_path) const {
     for (size_t i = 0; i < _indexes.size(); i++) {
+        if (_indexes[i].get_escaped_index_suffix_path() != 
escape_for_path_name(suffix_path)) {

Review Comment:
   put it into the next if branch



##########
be/src/olap/tablet_schema.h:
##########
@@ -207,16 +211,15 @@ class TabletIndex {
 
         return 0;
     }
-    TabletIndex(const TabletIndex& other) {
-        _index_id = other._index_id;
-        _index_name = other._index_name;
-        _index_type = other._index_type;
-        _col_unique_ids = other._col_unique_ids;
-        _properties = other._properties;
-    }
+
+    const std::string& get_escaped_index_suffix_path() const { return 
_escaped_index_suffix_path; }

Review Comment:
   Can we simplify the function name to get_suffix_path?



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -427,6 +428,13 @@ Status BetaRowsetWriter::add_rowset(RowsetSharedPtr 
rowset) {
     if (rowset->rowset_meta()->has_delete_predicate()) {
         
_rowset_meta->set_delete_predicate(rowset->rowset_meta()->delete_predicate());
     }
+    // Update the tablet schema in the rowset metadata if the tablet schema 
contains a variant.
+    // During the build process, _context.tablet_schema will be used as the 
rowset schema.
+    // This situation may arise in the event of a linked schema change. If 
this schema is not set,
+    // the subcolumns of the variant will be lost.
+    if (_context.tablet_schema->num_variant_columns() > 0 && 
rowset->tablet_schema() != nullptr) {
+        _context.tablet_schema = rowset->tablet_schema();

Review Comment:
   Is lock needed here?



##########
be/src/olap/rowset/segment_v2/inverted_index_desc.cpp:
##########
@@ -27,30 +27,32 @@ const std::string segment_suffix = ".dat";
 const std::string index_suffix = ".idx";
 const std::string index_name_separator = "_";
 
-std::string InvertedIndexDescriptor::get_temporary_index_path(const 
std::string& segment_path,
-                                                              uint32_t uuid) {
+std::string InvertedIndexDescriptor::get_temporary_index_path(
+        const std::string& segment_path, uint32_t uuid, const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid);
+           std::to_string(uuid) + index_suffix_path;
 }
 
 std::string InvertedIndexDescriptor::get_index_file_name(const std::string& 
segment_path,
-                                                         uint32_t uuid) {
+                                                         uint32_t uuid,
+                                                         const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid) + index_suffix;
+           std::to_string(uuid) + index_suffix_path + index_suffix;
 }
 
-std::string InvertedIndexDescriptor::inverted_index_file_path(const string& 
rowset_dir,
-                                                              const RowsetId& 
rowset_id,
-                                                              int segment_id, 
int64_t index_id) {
+std::string InvertedIndexDescriptor::inverted_index_file_path(
+        const string& rowset_dir, const RowsetId& rowset_id, int segment_id, 
int64_t index_id,
+        const std::string& index_suffix_path) {
     // {rowset_dir}/{schema_hash}/{rowset_id}_{seg_num}_{index_id}.idx
-    return fmt::format("{}/{}_{}_{}.idx", rowset_dir, rowset_id.to_string(), 
segment_id, index_id);
+    return fmt::format("{}/{}_{}_{}{}.idx", rowset_dir, rowset_id.to_string(), 
segment_id, index_id,
+                       index_suffix_path);
 }
 
 std::string InvertedIndexDescriptor::local_inverted_index_path_segcompacted(
         const string& tablet_path, const RowsetId& rowset_id, int64_t begin, 
int64_t end,
-        int64_t index_id) {
+        int64_t index_id, const std::string& index_suffix_path) {
     // 
{root_path}/data/{shard_id}/{tablet_id}/{schema_hash}/{rowset_id}_{begin_seg}-{end_seg}_{index_id}.idx
-    return fmt::format("{}/{}_{}-{}_{}.idx", tablet_path, 
rowset_id.to_string(), begin, end,
-                       index_id);
+    return fmt::format("{}/{}_{}-{}_{}{}.idx", tablet_path, 
rowset_id.to_string(), begin, end,

Review Comment:
   add a index_name_separator '_' between id and suffix to be more clear



##########
be/src/vec/data_types/data_type_object.cpp:
##########
@@ -47,10 +47,7 @@ namespace doris::vectorized {
 DataTypeObject::DataTypeObject(const String& schema_format_, bool is_nullable_)
         : schema_format(to_lower(schema_format_)), is_nullable(is_nullable_) {}
 bool DataTypeObject::equals(const IDataType& rhs) const {
-    if (const auto* object = typeid_cast<const DataTypeObject*>(&rhs)) {
-        return schema_format == object->schema_format && is_nullable == 
object->is_nullable;
-    }
-    return false;
+    return typeid_cast<const DataTypeObject*>(&rhs) != nullptr;

Review Comment:
   just check is not nullptr?



##########
be/src/olap/rowset/segment_creator.h:
##########
@@ -142,7 +145,7 @@ class SegmentFlusher {
                                  int64_t* flush_size = nullptr);
 
 private:
-    RowsetWriterContext _context;
+    RowsetWriterContext* _context;

Review Comment:
   Why change it from object to pointer?



##########
be/src/olap/tablet_schema.cpp:
##########
@@ -1030,8 +1166,12 @@ bool TabletSchema::has_inverted_index(int32_t 
col_unique_id) const {
     return false;
 }
 
-bool TabletSchema::has_inverted_index_with_index_id(int32_t index_id) const {
+bool TabletSchema::has_inverted_index_with_index_id(int32_t index_id,
+                                                    const std::string& 
suffix_name) const {
     for (size_t i = 0; i < _indexes.size(); i++) {
+        if (_indexes[i].get_escaped_index_suffix_path() != suffix_name) {

Review Comment:
   put it into the next if branch



##########
be/src/olap/tablet_schema.cpp:
##########
@@ -1015,9 +1145,15 @@ std::vector<const TabletIndex*> 
TabletSchema::get_indexes_for_column(int32_t col
     return indexes_for_column;
 }
 
-bool TabletSchema::has_inverted_index(int32_t col_unique_id) const {
+bool TabletSchema::has_inverted_index(const TabletColumn& col) const {
     // TODO use more efficient impl
+    int32_t col_unique_id = col.unique_id();
+    const std::string& suffix_path =
+            !col.path_info().empty() ? 
escape_for_path_name(col.path_info().get_path()) : "";
     for (size_t i = 0; i < _indexes.size(); i++) {
+        if (_indexes[i].get_escaped_index_suffix_path() != suffix_path) {

Review Comment:
   put it into the next if branch since string op is cost.



##########
be/src/olap/memtable.cpp:
##########
@@ -505,4 +505,4 @@ std::unique_ptr<vectorized::Block> MemTable::to_block() {
     return vectorized::Block::create_unique(_output_mutable_block.to_block());
 }
 
-} // namespace doris
+} // namespace doris

Review Comment:
   useless change



##########
be/src/olap/rowset/segment_v2/inverted_index_desc.cpp:
##########
@@ -27,30 +27,32 @@ const std::string segment_suffix = ".dat";
 const std::string index_suffix = ".idx";
 const std::string index_name_separator = "_";
 
-std::string InvertedIndexDescriptor::get_temporary_index_path(const 
std::string& segment_path,
-                                                              uint32_t uuid) {
+std::string InvertedIndexDescriptor::get_temporary_index_path(
+        const std::string& segment_path, uint32_t uuid, const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid);
+           std::to_string(uuid) + index_suffix_path;

Review Comment:
   add a index_name_separator '_' between id and suffix to be more clear



##########
be/src/olap/rowset/segment_creator.cpp:
##########
@@ -40,32 +49,178 @@ SegmentFlusher::SegmentFlusher() = default;
 
 SegmentFlusher::~SegmentFlusher() = default;
 
-Status SegmentFlusher::init(const RowsetWriterContext& rowset_writer_context) {
-    _context = rowset_writer_context;
+Status SegmentFlusher::init(RowsetWriterContext& rowset_writer_context) {
+    _context = &rowset_writer_context;
     return Status::OK();
 }
 
 Status SegmentFlusher::flush_single_block(const vectorized::Block* block, 
int32_t segment_id,
-                                          int64_t* flush_size, 
TabletSchemaSPtr flush_schema) {
+                                          int64_t* flush_size) {
     if (block->rows() == 0) {
         return Status::OK();
     }
-    bool no_compression = block->bytes() <= 
config::segment_compression_threshold_kb * 1024;
+    TabletSchemaSPtr flush_schema = nullptr;
+    // Expand variant columns
+    vectorized::Block flush_block(*block);
+    if (_context->write_type != DataWriteType::TYPE_COMPACTION &&
+        _context->tablet_schema->num_variant_columns() > 0) {
+        RETURN_IF_ERROR(_expand_variant_to_subcolumns(flush_block, 
flush_schema));
+    }
+    bool no_compression = flush_block.bytes() <= 
config::segment_compression_threshold_kb * 1024;
     if (config::enable_vertical_segment_writer &&
-        _context.tablet_schema->cluster_key_idxes().empty()) {
+        _context->tablet_schema->cluster_key_idxes().empty()) {
         std::unique_ptr<segment_v2::VerticalSegmentWriter> writer;
         RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, 
no_compression, flush_schema));
-        RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows()));
+        RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, 
flush_block.rows()));
         RETURN_IF_ERROR(_flush_segment_writer(writer, flush_size));
     } else {
         std::unique_ptr<segment_v2::SegmentWriter> writer;
         RETURN_IF_ERROR(_create_segment_writer(writer, segment_id, 
no_compression, flush_schema));
-        RETURN_IF_ERROR(_add_rows(writer, block, 0, block->rows()));
+        RETURN_IF_ERROR(_add_rows(writer, &flush_block, 0, 
flush_block.rows()));
         RETURN_IF_ERROR(_flush_segment_writer(writer, flush_size));
     }
     return Status::OK();
 }
 
+Status SegmentFlusher::_expand_variant_to_subcolumns(vectorized::Block& block,
+                                                     TabletSchemaSPtr& 
flush_schema) {
+    size_t num_rows = block.rows();
+    if (num_rows == 0) {
+        return Status::OK();
+    }
+
+    std::vector<int> variant_column_pos;
+    if (_context->partial_update_info && 
_context->partial_update_info->is_partial_update) {
+        // check columns that used to do partial updates should not include 
variant
+        for (int i : _context->partial_update_info->update_cids) {
+            const auto& col = _context->tablet_schema->columns()[i];
+            if (!col.is_key() && col.name() != DELETE_SIGN) {
+                return Status::InvalidArgument(
+                        "Not implement partial update for variant only support 
delete currently");
+            }
+        }
+    } else {
+        for (int i = 0; i < _context->tablet_schema->columns().size(); ++i) {
+            if (_context->tablet_schema->columns()[i].is_variant_type()) {
+                variant_column_pos.push_back(i);
+            }
+        }
+    }
+
+    if (variant_column_pos.empty()) {
+        return Status::OK();
+    }
+
+    try {
+        // Parse each variant column from raw string column
+        vectorized::schema_util::parse_variant_columns(block, 
variant_column_pos);
+        vectorized::schema_util::finalize_variant_columns(block, 
variant_column_pos,
+                                                          false /*not ingore 
sparse*/);
+        vectorized::schema_util::encode_variant_sparse_subcolumns(block, 
variant_column_pos);

Review Comment:
   The 3 functions parse_variant_columns, finalize_variant_columns and 
encode_variant_sparse_subcolumns are only used here. We can encapsulate them in 
one function to hide the detail to caller.



##########
be/src/vec/common/schema_util.cpp:
##########
@@ -289,70 +303,140 @@ void update_least_common_schema(const 
std::vector<TabletSchemaSPtr>& schemas,
         TabletColumn common_column;
         // const std::string& column_name = variant_col_name + "." + 
tuple_paths[i].get_path();
         get_column_by_type(tuple_types[i], tuple_paths[i].get_path(), 
common_column,
-                           ExtraInfo {.unique_id = -1,
+                           ExtraInfo {.unique_id = variant_col_unique_id,
                                       .parent_unique_id = 
variant_col_unique_id,
                                       .path_info = tuple_paths[i]});
         common_schema->append_column(common_column);
     }
 }
 
-void get_least_common_schema(const std::vector<TabletSchemaSPtr>& schemas,
-                             TabletSchemaSPtr& common_schema) {
-    // Pick tablet schema with max schema version
-    const TabletSchemaSPtr base_schema =
-            *std::max_element(schemas.cbegin(), schemas.cend(),
-                              [](const TabletSchemaSPtr a, const 
TabletSchemaSPtr b) {
-                                  return a->schema_version() < 
b->schema_version();
-                              });
-    CHECK(base_schema);
-    CHECK(common_schema);
-    common_schema->copy_from(*base_schema);
-    // Merge columns from other schemas
-    common_schema->clear_columns();
-    std::vector<int32_t> variant_column_unique_id;
-    // Get all columns without extracted columns and collect variant col 
unique id
-    for (const TabletColumn& col : base_schema->columns()) {
-        if (col.is_variant_type()) {
-            variant_column_unique_id.push_back(col.unique_id());
+void inherit_tablet_index(TabletSchemaSPtr& schema) {
+    std::unordered_map<int32_t, TabletIndex> variants_index_meta;
+    // Get all variants tablet index metas if exist
+    for (const auto& col : schema->columns()) {
+        auto index_meta = schema->get_inverted_index(col.unique_id(), "");
+        if (col.is_variant_type() && index_meta != nullptr) {
+            variants_index_meta.emplace(col.unique_id(), *index_meta);
         }
+    }
+
+    // Add index meta if extracted column is missing index meta
+    for (const auto& col : schema->columns()) {
         if (!col.is_extracted_column()) {
-            common_schema->append_column(col);
+            continue;
+        }
+        auto it = variants_index_meta.find(col.parent_unique_id());
+        // variant has no index meta, ignore
+        if (it == variants_index_meta.end()) {
+            continue;
+        }
+        auto index_meta = schema->get_inverted_index(col);
+        // add index meta
+        TabletIndex index_info = it->second;
+        
index_info.set_escaped_escaped_index_suffix_path(col.path_info().get_path());
+        if (index_meta != nullptr) {
+            // already exist
+            schema->update_index(col, index_info);

Review Comment:
   What's updated for the new index_info?



##########
be/src/olap/rowset/segment_v2/inverted_index_desc.cpp:
##########
@@ -27,30 +27,32 @@ const std::string segment_suffix = ".dat";
 const std::string index_suffix = ".idx";
 const std::string index_name_separator = "_";
 
-std::string InvertedIndexDescriptor::get_temporary_index_path(const 
std::string& segment_path,
-                                                              uint32_t uuid) {
+std::string InvertedIndexDescriptor::get_temporary_index_path(
+        const std::string& segment_path, uint32_t uuid, const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid);
+           std::to_string(uuid) + index_suffix_path;
 }
 
 std::string InvertedIndexDescriptor::get_index_file_name(const std::string& 
segment_path,
-                                                         uint32_t uuid) {
+                                                         uint32_t uuid,
+                                                         const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid) + index_suffix;
+           std::to_string(uuid) + index_suffix_path + index_suffix;
 }
 
-std::string InvertedIndexDescriptor::inverted_index_file_path(const string& 
rowset_dir,
-                                                              const RowsetId& 
rowset_id,
-                                                              int segment_id, 
int64_t index_id) {
+std::string InvertedIndexDescriptor::inverted_index_file_path(
+        const string& rowset_dir, const RowsetId& rowset_id, int segment_id, 
int64_t index_id,
+        const std::string& index_suffix_path) {
     // {rowset_dir}/{schema_hash}/{rowset_id}_{seg_num}_{index_id}.idx
-    return fmt::format("{}/{}_{}_{}.idx", rowset_dir, rowset_id.to_string(), 
segment_id, index_id);
+    return fmt::format("{}/{}_{}_{}{}.idx", rowset_dir, rowset_id.to_string(), 
segment_id, index_id,

Review Comment:
   add a index_name_separator '_' between id and suffix to be more clear



##########
be/src/olap/rowset/segment_v2/inverted_index_desc.cpp:
##########
@@ -27,30 +27,32 @@ const std::string segment_suffix = ".dat";
 const std::string index_suffix = ".idx";
 const std::string index_name_separator = "_";
 
-std::string InvertedIndexDescriptor::get_temporary_index_path(const 
std::string& segment_path,
-                                                              uint32_t uuid) {
+std::string InvertedIndexDescriptor::get_temporary_index_path(
+        const std::string& segment_path, uint32_t uuid, const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid);
+           std::to_string(uuid) + index_suffix_path;
 }
 
 std::string InvertedIndexDescriptor::get_index_file_name(const std::string& 
segment_path,
-                                                         uint32_t uuid) {
+                                                         uint32_t uuid,
+                                                         const std::string& 
index_suffix_path) {
     return StripSuffixString(segment_path, segment_suffix) + 
index_name_separator +
-           std::to_string(uuid) + index_suffix;
+           std::to_string(uuid) + index_suffix_path + index_suffix;

Review Comment:
   add a index_name_separator '_' between id and suffix to be more clear
   
   



##########
be/src/olap/rowset/beta_rowset_writer.cpp:
##########
@@ -445,15 +453,9 @@ Status BetaRowsetWriter::flush_memtable(vectorized::Block* 
block, int32_t segmen
         return Status::OK();
     }
 
-    TabletSchemaSPtr flush_schema;
-    if (_context.tablet_schema->num_variant_columns() > 0) {
-        // Unfold variant column
-        RETURN_IF_ERROR(expand_variant_to_subcolumns(*block, flush_schema));

Review Comment:
   Why move expand_variant_to_subcolumns to _segment_creator.flush_single_block?



##########
be/src/olap/schema_change.h:
##########
@@ -119,7 +119,7 @@ class SchemaChange {
                 _inner_process(rowset_reader, rowset_writer, new_tablet, 
base_tablet_schema));
         _add_filtered_rows(rowset_reader->filtered_rows());
 
-        // Check row num changes
+        // check row num changes

Review Comment:
   useless change



##########
be/src/olap/schema.cpp:
##########
@@ -130,10 +131,11 @@ vectorized::IColumn::MutablePtr 
Schema::get_column_by_field(const Field& field)
     return get_data_type_ptr(field)->create_column();
 }
 
-vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(const Field& 
field,
+vectorized::IColumn::MutablePtr Schema::get_predicate_column_ptr(const 
FieldType& type,
+                                                                 bool 
is_nullable,

Review Comment:
   Why change Field to FieldType + is_nullable?



##########
be/src/olap/column_predicate.h:
##########
@@ -210,6 +212,11 @@ class ColumnPredicate {
 
     virtual bool can_do_bloom_filter(bool ngram) const { return false; }
 
+    // Check input type could apply safely.
+    // Note: Currenly ColumnPredicate is not include complex type, so use 
PrimitiveType
+    // is simple and intuitive
+    virtual bool can_do_apply_safely(PrimitiveType input_type, bool is_null) 
const = 0;

Review Comment:
   We can define a base impl for can_do_apply_safely since it's the same for 
most subclass predicate.



##########
be/src/olap/tablet_schema.h:
##########
@@ -207,16 +211,15 @@ class TabletIndex {
 
         return 0;
     }
-    TabletIndex(const TabletIndex& other) {

Review Comment:
   Why delete ctor?



##########
be/src/vec/data_types/serde/data_type_jsonb_serde.cpp:
##########
@@ -199,7 +199,7 @@ void DataTypeJsonbSerDe::write_one_cell_to_json(const 
IColumn& column, rapidjson
     auto& data = assert_cast<const ColumnString&>(column);
     const auto jsonb_val = data.get_data_at(row_num);
     if (jsonb_val.empty()) {
-        result.SetNull();
+        return;

Review Comment:
   @amorynan , can you have a look



##########
be/src/olap/match_predicate.cpp:
##########
@@ -39,34 +44,37 @@ PredicateType MatchPredicate::type() const {
     return PredicateType::MATCH;
 }
 
-Status MatchPredicate::evaluate(const Schema& schema, InvertedIndexIterator* 
iterator,
-                                uint32_t num_rows, roaring::Roaring* bitmap) 
const {
+Status MatchPredicate::evaluate(const vectorized::NameAndTypePair& 
name_with_type,
+                                InvertedIndexIterator* iterator, uint32_t 
num_rows,
+                                roaring::Roaring* bitmap) const {
     if (iterator == nullptr) {
         return Status::OK();
     }
     if (_skip_evaluate(iterator)) {
         return Status::Error<ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED>(
                 "match predicate evaluate skipped.");
     }
-    auto column_desc = schema.column(_column_id);
+    auto type = name_with_type.second;
+    const std::string& name = name_with_type.first;
     roaring::Roaring roaring;
     auto inverted_index_query_type = 
_to_inverted_index_query_type(_match_type);
-
-    if (is_string_type(column_desc->type()) ||
-        (column_desc->type() == FieldType::OLAP_FIELD_TYPE_ARRAY &&
-         is_string_type(column_desc->get_sub_field(0)->type_info()->type()))) {
+    TypeDescriptor column_desc = type->get_type_as_type_descriptor();
+    if (is_string_type(column_desc.type)) {

Review Comment:
   Why is Array<String> ignored here?



##########
be/src/olap/tablet_schema.h:
##########
@@ -207,16 +211,15 @@ class TabletIndex {
 
         return 0;
     }
-    TabletIndex(const TabletIndex& other) {
-        _index_id = other._index_id;
-        _index_name = other._index_name;
-        _index_type = other._index_type;
-        _col_unique_ids = other._col_unique_ids;
-        _properties = other._properties;
-    }
+
+    const std::string& get_escaped_index_suffix_path() const { return 
_escaped_index_suffix_path; }

Review Comment:
   get_index_suffix may be better, since path is confusing to index file path



##########
be/src/vec/columns/column_object.cpp:
##########
@@ -436,8 +436,7 @@ ColumnPtr ColumnObject::index(const IColumn& indexes, 
size_t limit) const {
 }
 
 bool ColumnObject::Subcolumn::check_if_sparse_column(size_t num_rows) {
-    constexpr static size_t s_threshold_rows_estimate_sparse_column = 1000;
-    if (num_rows < s_threshold_rows_estimate_sparse_column) {
+    if (num_rows < config::variant_threshold_rows_to_estimate_sparse_column) {

Review Comment:
   If there are many columns in the 
variant_threshold_rows_to_estimate_sparse_column rows, will it cause problem.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Re: [PR] [Feature-Variant](Variant Type) support variant type query and index [doris]

Reply via email to