This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 051298381a0 [opt](index compaction)Optimize logic of picking columns 
for index compaction (#42051)
051298381a0 is described below

commit 051298381a052f1c8ef9782b802e2c264f919424
Author: qiye <jianliang5...@gmail.com>
AuthorDate: Fri Oct 18 19:41:40 2024 +0800

    [opt](index compaction)Optimize logic of picking columns for index 
compaction (#42051)
    
    ## Proposed changes
    
    1. Rename `skip_inverted_index` to `columns_to_do_index_compaction` to
    make it more clear.
    2. Remove redundant rowid_conversion check in index compaction
    3. Rename function `construct_skip_inverted_index` to
    `construct_index_compaction_columns`
    4. Check `index.col_unique_ids` empty before using to avoid crash and
    log tablet schema for debug.
    5. Avoid unnecessary bkd and variant index checks by advancing column
    type checking before `has_inverted_index`
---
 be/src/olap/compaction.cpp                         | 73 ++++++++--------------
 be/src/olap/compaction.h                           |  2 +-
 be/src/olap/rowset/rowset_writer_context.h         |  4 +-
 be/src/olap/rowset/segment_v2/segment_writer.cpp   |  5 +-
 .../rowset/segment_v2/vertical_segment_writer.cpp  |  5 +-
 5 files changed, 35 insertions(+), 54 deletions(-)

diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
index 84830502366..cea825e15e1 100644
--- a/be/src/olap/compaction.cpp
+++ b/be/src/olap/compaction.cpp
@@ -47,6 +47,7 @@
 #include "olap/cumulative_compaction_policy.h"
 #include "olap/cumulative_compaction_time_series_policy.h"
 #include "olap/data_dir.h"
+#include "olap/olap_common.h"
 #include "olap/olap_define.h"
 #include "olap/rowset/beta_rowset.h"
 #include "olap/rowset/beta_rowset_writer.h"
@@ -174,10 +175,11 @@ Status Compaction::merge_input_rowsets() {
 
     // write merged rows to output rowset
     // The test results show that merger is low-memory-footprint, there is no 
need to tracker its mem pool
-    // if ctx.skip_inverted_index.size() > 0, it means we need to do inverted 
index compaction.
+    // if ctx.columns_to_do_index_compaction.size() > 0, it means we need to 
do inverted index compaction.
     // the row ID conversion matrix needs to be used for inverted index 
compaction.
-    if (!ctx.skip_inverted_index.empty() || (_tablet->keys_type() == 
KeysType::UNIQUE_KEYS &&
-                                             
_tablet->enable_unique_key_merge_on_write())) {
+    if (!ctx.columns_to_do_index_compaction.empty() ||
+        (_tablet->keys_type() == KeysType::UNIQUE_KEYS &&
+         _tablet->enable_unique_key_merge_on_write())) {
         _stats.rowid_conversion = _rowid_conversion.get();
     }
 
@@ -482,46 +484,12 @@ Status CompactionMixin::execute_compact_impl(int64_t 
permits) {
 Status Compaction::do_inverted_index_compaction() {
     const auto& ctx = _output_rs_writer->context();
     if (!config::inverted_index_compaction_enable || _input_row_num <= 0 ||
-        !_stats.rowid_conversion || ctx.skip_inverted_index.empty()) {
+        !_stats.rowid_conversion || 
ctx.columns_to_do_index_compaction.empty()) {
         return Status::OK();
     }
 
     OlapStopWatch inverted_watch;
 
-    int64_t cur_max_version = 0;
-    {
-        std::shared_lock rlock(_tablet->get_header_lock());
-        cur_max_version = _tablet->max_version_unlocked();
-    }
-
-    DeleteBitmap output_rowset_delete_bitmap(_tablet->tablet_id());
-    std::set<RowLocation> missed_rows;
-    std::map<RowsetSharedPtr, std::list<std::pair<RowLocation, RowLocation>>> 
location_map;
-    // Convert the delete bitmap of the input rowsets to output rowset.
-    _tablet->calc_compaction_output_rowset_delete_bitmap(
-            _input_rowsets, *_rowid_conversion, 0, cur_max_version + 1, 
&missed_rows, &location_map,
-            _tablet->tablet_meta()->delete_bitmap(), 
&output_rowset_delete_bitmap);
-
-    if (!_allow_delete_in_cumu_compaction) {
-        if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION &&
-            _stats.merged_rows != missed_rows.size() && 
_tablet->tablet_state() == TABLET_RUNNING) {
-            std::string err_msg = fmt::format(
-                    "cumulative compaction: the merged rows({}) is not equal 
to missed "
-                    "rows({}) in rowid conversion, tablet_id: {}, table_id:{}",
-                    _stats.merged_rows, missed_rows.size(), 
_tablet->tablet_id(),
-                    _tablet->table_id());
-            if (config::enable_mow_compaction_correctness_check_core) {
-                CHECK(false) << err_msg;
-            } else {
-                DCHECK(false) << err_msg;
-            }
-            // log here just for debugging, do not return error
-            LOG(WARNING) << err_msg;
-        }
-    }
-
-    RETURN_IF_ERROR(_tablet->check_rowid_conversion(_output_rowset, 
location_map));
-
     // translation vec
     // <<dest_idx_num, dest_docId>>
     // the first level vector: index indicates src segment.
@@ -718,7 +686,7 @@ Status Compaction::do_inverted_index_compaction() {
     };
 
     Status status = Status::OK();
-    for (auto&& column_uniq_id : ctx.skip_inverted_index) {
+    for (auto&& column_uniq_id : ctx.columns_to_do_index_compaction) {
         auto col = _cur_tablet_schema->column_by_uid(column_uniq_id);
         const auto* index_meta = _cur_tablet_schema->get_inverted_index(col);
 
@@ -809,13 +777,25 @@ Status Compaction::do_inverted_index_compaction() {
     return Status::OK();
 }
 
-void Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) {
+void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) {
     for (const auto& index : _cur_tablet_schema->indexes()) {
         if (index.index_type() != IndexType::INVERTED) {
             continue;
         }
 
-        auto col_unique_id = index.col_unique_ids()[0];
+        auto col_unique_ids = index.col_unique_ids();
+        // check if column unique ids is empty to avoid crash
+        if (col_unique_ids.empty()) {
+            LOG(WARNING) << "tablet[" << _tablet->tablet_id() << "] index[" << 
index.index_id()
+                         << "] has no column unique id, will skip index 
compaction."
+                         << " tablet_schema=" << 
_cur_tablet_schema->dump_full_schema();
+            continue;
+        }
+        auto col_unique_id = col_unique_ids[0];
+        // Avoid doing inverted index compaction on non-slice type columns
+        if 
(!field_is_slice_type(_cur_tablet_schema->column_by_uid(col_unique_id).type())) 
{
+            continue;
+        }
         auto has_inverted_index = [&](const RowsetSharedPtr& src_rs) {
             auto* rowset = static_cast<BetaRowset*>(src_rs.get());
             if (rowset->is_skip_index_compaction(col_unique_id)) {
@@ -877,7 +857,7 @@ void 
Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) {
                     reader->close();
 
                     // why is 3?
-                    // bkd index will write at least 3 files
+                    // slice type index file at least has 3 files: 
null_bitmap, segments_N, segments.gen
                     if (files.size() < 3) {
                         LOG(WARNING) << "tablet[" << _tablet->tablet_id() << 
"] column_unique_id["
                                      << col_unique_id << "]," << 
index_file_path
@@ -897,9 +877,8 @@ void 
Compaction::construct_skip_inverted_index(RowsetWriterContext& ctx) {
         bool all_have_inverted_index = std::all_of(_input_rowsets.begin(), 
_input_rowsets.end(),
                                                    
std::move(has_inverted_index));
 
-        if (all_have_inverted_index &&
-            
field_is_slice_type(_cur_tablet_schema->column_by_uid(col_unique_id).type())) {
-            ctx.skip_inverted_index.insert(col_unique_id);
+        if (all_have_inverted_index) {
+            ctx.columns_to_do_index_compaction.insert(col_unique_id);
         }
     }
 }
@@ -912,7 +891,7 @@ Status 
CompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx)
           _tablet->keys_type() == KeysType::DUP_KEYS)) &&
         _cur_tablet_schema->get_inverted_index_storage_format() ==
                 InvertedIndexStorageFormatPB::V1) {
-        construct_skip_inverted_index(ctx);
+        construct_index_compaction_columns(ctx);
     }
     ctx.version = _output_version;
     ctx.rowset_state = VISIBLE;
@@ -1209,7 +1188,7 @@ Status 
CloudCompactionMixin::construct_output_rowset_writer(RowsetWriterContext&
           _tablet->keys_type() == KeysType::DUP_KEYS)) &&
         _cur_tablet_schema->get_inverted_index_storage_format() ==
                 InvertedIndexStorageFormatPB::V1) {
-        construct_skip_inverted_index(ctx);
+        construct_index_compaction_columns(ctx);
     }
 
     // Use the storage resource of the previous rowset
diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h
index 38d50595ca8..08afe840280 100644
--- a/be/src/olap/compaction.h
+++ b/be/src/olap/compaction.h
@@ -69,7 +69,7 @@ protected:
 
     Status do_inverted_index_compaction();
 
-    void construct_skip_inverted_index(RowsetWriterContext& ctx);
+    void construct_index_compaction_columns(RowsetWriterContext& ctx);
 
     virtual Status construct_output_rowset_writer(RowsetWriterContext& ctx) = 
0;
 
diff --git a/be/src/olap/rowset/rowset_writer_context.h 
b/be/src/olap/rowset/rowset_writer_context.h
index e13f7efe6e9..cb0fda83e60 100644
--- a/be/src/olap/rowset/rowset_writer_context.h
+++ b/be/src/olap/rowset/rowset_writer_context.h
@@ -79,8 +79,8 @@ struct RowsetWriterContext {
 
     int64_t newest_write_timestamp = -1;
     bool enable_unique_key_merge_on_write = false;
-    // store column_unique_id to skip write inverted index
-    std::set<int32_t> skip_inverted_index;
+    // store column_unique_id to do index compaction
+    std::set<int32_t> columns_to_do_index_compaction;
     DataWriteType write_type = DataWriteType::TYPE_DEFAULT;
     BaseTabletSPtr tablet = nullptr;
 
diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/segment_writer.cpp
index c21021eb9d2..ed739fefcc1 100644
--- a/be/src/olap/rowset/segment_v2/segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp
@@ -209,8 +209,9 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, 
const TabletColumn& co
     opts.need_bitmap_index = column.has_bitmap_index();
     bool skip_inverted_index = false;
     if (_opts.rowset_ctx != nullptr) {
-        // skip write inverted index for index compaction
-        skip_inverted_index = 
_opts.rowset_ctx->skip_inverted_index.count(column.unique_id()) > 0;
+        // skip write inverted index for index compaction column
+        skip_inverted_index =
+                
_opts.rowset_ctx->columns_to_do_index_compaction.count(column.unique_id()) > 0;
     }
     // skip write inverted index on load if skip_write_index_on_load is true
     if (_opts.write_type == DataWriteType::TYPE_DIRECT && 
schema->skip_write_index_on_load()) {
diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp 
b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
index 8f7f614d7b3..ce033cdd002 100644
--- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp
@@ -202,8 +202,9 @@ Status 
VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo
     opts.need_bitmap_index = column.has_bitmap_index();
     bool skip_inverted_index = false;
     if (_opts.rowset_ctx != nullptr) {
-        // skip write inverted index for index compaction
-        skip_inverted_index = 
_opts.rowset_ctx->skip_inverted_index.contains(column.unique_id());
+        // skip write inverted index for index compaction column
+        skip_inverted_index =
+                
_opts.rowset_ctx->columns_to_do_index_compaction.contains(column.unique_id());
     }
     // skip write inverted index on load if skip_write_index_on_load is true
     if (_opts.write_type == DataWriteType::TYPE_DIRECT &&


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to