This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new e174ddceda2 branch-3.0: [fix](index size) discard index size when meta 
size is invalid #46549 (#46717)
e174ddceda2 is described below

commit e174ddceda2175d67441e65a2a14f310b79fd674
Author: airborne12 <jiang...@selectdb.com>
AuthorDate: Mon Jan 13 14:32:12 2025 +0800

    branch-3.0: [fix](index size) discard index size when meta size is invalid 
#46549 (#46717)
    
    cherry pick from #46549
---
 be/src/olap/compaction.cpp                    |  22 +++-
 be/src/olap/rowset/beta_rowset.cpp            |   2 +-
 be/src/olap/rowset/beta_rowset.h              |   2 +-
 be/src/olap/rowset/beta_rowset_writer.cpp     |  23 ++++-
 be/src/olap/rowset/rowset.h                   |   8 +-
 be/src/olap/rowset/rowset_meta.h              |  12 +--
 be/src/olap/task/index_builder.cpp            |   2 +-
 be/test/olap/ordered_data_compaction_test.cpp | 138 +++++++++++++++++++++++++-
 be/test/testutil/mock_rowset.h                |   4 +
 9 files changed, 195 insertions(+), 18 deletions(-)

diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp
index 5f490e99034..c507cc17a27 100644
--- a/be/src/olap/compaction.cpp
+++ b/be/src/olap/compaction.cpp
@@ -315,9 +315,25 @@ Status CompactionMixin::do_compact_ordered_rowsets() {
 
 void CompactionMixin::build_basic_info() {
     for (auto& rowset : _input_rowsets) {
-        _input_rowsets_data_size += rowset->data_disk_size();
-        _input_rowsets_index_size += rowset->index_disk_size();
-        _input_rowsets_total_size += rowset->total_disk_size();
+        const auto& rowset_meta = rowset->rowset_meta();
+        auto index_size = rowset_meta->index_disk_size();
+        auto total_size = rowset_meta->total_disk_size();
+        auto data_size = rowset_meta->data_disk_size();
+        // corrupted index size caused by bug before 2.1.5 or 3.0.0 version
+        // try to get real index size from disk.
+        if (index_size < 0 || index_size > total_size * 2) {
+            LOG(ERROR) << "invalid index size:" << index_size << " total 
size:" << total_size
+                       << " data size:" << data_size << " tablet:" << 
rowset_meta->tablet_id()
+                       << " rowset:" << rowset_meta->rowset_id();
+            index_size = 0;
+            auto st = rowset->get_inverted_index_size(&index_size);
+            if (!st.ok()) {
+                LOG(ERROR) << "failed to get inverted index size. res=" << st;
+            }
+        }
+        _input_rowsets_data_size += data_size;
+        _input_rowsets_index_size += index_size;
+        _input_rowsets_total_size += total_size;
         _input_row_num += rowset->num_rows();
         _input_num_segments += rowset->num_segments();
     }
diff --git a/be/src/olap/rowset/beta_rowset.cpp 
b/be/src/olap/rowset/beta_rowset.cpp
index b83661238b8..9f33f363e99 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -74,7 +74,7 @@ Status BetaRowset::do_load(bool /*use_cache*/) {
     return Status::OK();
 }
 
-Status BetaRowset::get_inverted_index_size(size_t* index_size) {
+Status BetaRowset::get_inverted_index_size(int64_t* index_size) {
     const auto& fs = _rowset_meta->fs();
     if (!fs) {
         return Status::Error<INIT_FAILED>("get fs failed, resource_id={}",
diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h
index 52d5ac5c8a8..0b22d122741 100644
--- a/be/src/olap/rowset/beta_rowset.h
+++ b/be/src/olap/rowset/beta_rowset.h
@@ -80,7 +80,7 @@ public:
 
     Status get_segments_size(std::vector<size_t>* segments_size);
 
-    Status get_inverted_index_size(size_t* index_size);
+    Status get_inverted_index_size(int64_t* index_size) override;
 
     [[nodiscard]] virtual Status add_to_binlog() override;
 
diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp 
b/be/src/olap/rowset/beta_rowset_writer.cpp
index c25a1532d48..5158c28b0cf 100644
--- a/be/src/olap/rowset/beta_rowset_writer.cpp
+++ b/be/src/olap/rowset/beta_rowset_writer.cpp
@@ -652,8 +652,27 @@ Status BaseBetaRowsetWriter::add_rowset(RowsetSharedPtr 
rowset) {
     assert(rowset->rowset_meta()->rowset_type() == BETA_ROWSET);
     RETURN_IF_ERROR(rowset->link_files_to(_context.tablet_path, 
_context.rowset_id));
     _num_rows_written += rowset->num_rows();
-    _total_data_size += rowset->rowset_meta()->data_disk_size();
-    _total_index_size += rowset->rowset_meta()->index_disk_size();
+    const auto& rowset_meta = rowset->rowset_meta();
+    auto index_size = rowset_meta->index_disk_size();
+    auto total_size = rowset_meta->total_disk_size();
+    auto data_size = rowset_meta->data_disk_size();
+    // corrupted index size caused by bug before 2.1.5 or 3.0.0 version
+    // try to get real index size from disk.
+    if (index_size < 0 || index_size > total_size * 2) {
+        LOG(ERROR) << "invalid index size:" << index_size << " total size:" << 
total_size
+                   << " data size:" << data_size << " tablet:" << 
rowset_meta->tablet_id()
+                   << " rowset:" << rowset_meta->rowset_id();
+        index_size = 0;
+        auto st = rowset->get_inverted_index_size(&index_size);
+        if (!st.ok()) {
+            if (!st.is<NOT_FOUND>()) {
+                LOG(ERROR) << "failed to get inverted index size. res=" << st;
+                return st;
+            }
+        }
+    }
+    _total_data_size += data_size;
+    _total_index_size += index_size;
     _num_segment += static_cast<int32_t>(rowset->num_segments());
     // append key_bounds to current rowset
     
RETURN_IF_ERROR(rowset->get_segments_key_bounds(&_segments_encoded_key_bounds));
diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h
index 98d88ba19f2..be21f29888e 100644
--- a/be/src/olap/rowset/rowset.h
+++ b/be/src/olap/rowset/rowset.h
@@ -147,9 +147,9 @@ public:
     // helper class to access RowsetMeta
     int64_t start_version() const { return rowset_meta()->version().first; }
     int64_t end_version() const { return rowset_meta()->version().second; }
-    size_t index_disk_size() const { return rowset_meta()->index_disk_size(); }
-    size_t data_disk_size() const { return rowset_meta()->data_disk_size(); }
-    size_t total_disk_size() const { return rowset_meta()->total_disk_size(); }
+    int64_t index_disk_size() const { return rowset_meta()->index_disk_size(); 
}
+    int64_t data_disk_size() const { return rowset_meta()->data_disk_size(); }
+    int64_t total_disk_size() const { return rowset_meta()->total_disk_size(); 
}
     bool empty() const { return rowset_meta()->empty(); }
     bool zero_num_rows() const { return rowset_meta()->num_rows() == 0; }
     size_t num_rows() const { return rowset_meta()->num_rows(); }
@@ -210,6 +210,8 @@ public:
                                  size_t new_rowset_start_seg_id = 0,
                                  std::set<int64_t>* without_index_uids = 
nullptr) = 0;
 
+    virtual Status get_inverted_index_size(int64_t* index_size) = 0;
+
     // copy all files to `dir`
     virtual Status copy_files_to(const std::string& dir, const RowsetId& 
new_rowset_id) = 0;
 
diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h
index 46121aeae2b..40e55e07452 100644
--- a/be/src/olap/rowset/rowset_meta.h
+++ b/be/src/olap/rowset/rowset_meta.h
@@ -130,21 +130,21 @@ public:
 
     void set_num_rows(int64_t num_rows) { 
_rowset_meta_pb.set_num_rows(num_rows); }
 
-    size_t total_disk_size() const { return _rowset_meta_pb.total_disk_size(); 
}
+    int64_t total_disk_size() const { return 
_rowset_meta_pb.total_disk_size(); }
 
-    void set_total_disk_size(size_t total_disk_size) {
+    void set_total_disk_size(int64_t total_disk_size) {
         _rowset_meta_pb.set_total_disk_size(total_disk_size);
     }
 
-    size_t data_disk_size() const { return _rowset_meta_pb.data_disk_size(); }
+    int64_t data_disk_size() const { return _rowset_meta_pb.data_disk_size(); }
 
-    void set_data_disk_size(size_t data_disk_size) {
+    void set_data_disk_size(int64_t data_disk_size) {
         _rowset_meta_pb.set_data_disk_size(data_disk_size);
     }
 
-    size_t index_disk_size() const { return _rowset_meta_pb.index_disk_size(); 
}
+    int64_t index_disk_size() const { return 
_rowset_meta_pb.index_disk_size(); }
 
-    void set_index_disk_size(size_t index_disk_size) {
+    void set_index_disk_size(int64_t index_disk_size) {
         _rowset_meta_pb.set_index_disk_size(index_disk_size);
     }
 
diff --git a/be/src/olap/task/index_builder.cpp 
b/be/src/olap/task/index_builder.cpp
index bd17dedbbee..bc677ea6f5c 100644
--- a/be/src/olap/task/index_builder.cpp
+++ b/be/src/olap/task/index_builder.cpp
@@ -81,7 +81,7 @@ Status IndexBuilder::update_inverted_index_info() {
         TabletSchemaSPtr output_rs_tablet_schema = 
std::make_shared<TabletSchema>();
         const auto& input_rs_tablet_schema = input_rowset->tablet_schema();
         output_rs_tablet_schema->copy_from(*input_rs_tablet_schema);
-        size_t total_index_size = 0;
+        int64_t total_index_size = 0;
         auto* beta_rowset = reinterpret_cast<BetaRowset*>(input_rowset.get());
         auto size_st = beta_rowset->get_inverted_index_size(&total_index_size);
         
DBUG_EXECUTE_IF("IndexBuilder::update_inverted_index_info_size_st_not_ok", {
diff --git a/be/test/olap/ordered_data_compaction_test.cpp 
b/be/test/olap/ordered_data_compaction_test.cpp
index 058ed52dd99..934dfbef3ea 100644
--- a/be/test/olap/ordered_data_compaction_test.cpp
+++ b/be/test/olap/ordered_data_compaction_test.cpp
@@ -89,6 +89,15 @@ protected:
         EXPECT_TRUE(io::global_local_filesystem()
                             ->create_directory(absolute_dir + "/tablet_path")
                             .ok());
+        // tmp dir
+        
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(tmp_dir).ok());
+        
EXPECT_TRUE(io::global_local_filesystem()->create_directory(tmp_dir).ok());
+        std::vector<StorePath> paths;
+        paths.emplace_back(std::string(tmp_dir), 1024000000);
+        auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths);
+        st = tmp_file_dirs->init();
+        EXPECT_TRUE(st.ok()) << st.to_json();
+        ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs));
 
         doris::EngineOptions options;
         auto engine = std::make_unique<StorageEngine>(options);
@@ -153,6 +162,62 @@ protected:
         return tablet_schema;
     }
 
+    TabletSchemaSPtr create_inverted_index_v1_schema(KeysType keys_type = 
DUP_KEYS) {
+        TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
+        TabletSchemaPB tablet_schema_pb;
+        tablet_schema_pb.set_keys_type(keys_type);
+        tablet_schema_pb.set_num_short_key_columns(1);
+        tablet_schema_pb.set_num_rows_per_row_block(1024);
+        tablet_schema_pb.set_compress_kind(COMPRESS_NONE);
+        tablet_schema_pb.set_next_column_unique_id(4);
+        
tablet_schema_pb.set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V1);
+
+        auto* index_pb = tablet_schema_pb.add_index();
+        index_pb->set_index_id(1);
+        index_pb->set_index_name("c1_index");
+        index_pb->set_index_type(IndexType::INVERTED);
+        index_pb->add_col_unique_id(2);
+
+        ColumnPB* column_1 = tablet_schema_pb.add_column();
+        column_1->set_unique_id(1);
+        column_1->set_name("c1");
+        column_1->set_type("INT");
+        column_1->set_is_key(true);
+        column_1->set_length(4);
+        column_1->set_index_length(4);
+        column_1->set_is_nullable(false);
+        column_1->set_is_bf_column(false);
+
+        ColumnPB* column_2 = tablet_schema_pb.add_column();
+        column_2->set_unique_id(2);
+        column_2->set_name("c2");
+        column_2->set_type("INT");
+        column_2->set_length(4);
+        column_2->set_index_length(4);
+        column_2->set_is_nullable(true);
+        column_2->set_is_key(false);
+        column_2->set_is_nullable(false);
+        column_2->set_is_bf_column(false);
+
+        // unique table must contains the DELETE_SIGN column
+        if (keys_type == UNIQUE_KEYS) {
+            ColumnPB* column_3 = tablet_schema_pb.add_column();
+            column_3->set_unique_id(3);
+            column_3->set_name(DELETE_SIGN);
+            column_3->set_type("TINYINT");
+            column_3->set_length(1);
+            column_3->set_index_length(1);
+            column_3->set_is_nullable(false);
+            column_3->set_is_key(false);
+            column_3->set_is_nullable(false);
+            column_3->set_is_bf_column(false);
+        }
+
+        tablet_schema->init_from_pb(tablet_schema_pb);
+
+        return tablet_schema;
+    }
+
     TabletSchemaSPtr create_agg_schema() {
         TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>();
         TabletSchemaPB tablet_schema_pb;
@@ -401,7 +466,8 @@ protected:
     }
 
 private:
-    const std::string kTestDir = "/ut_dir/vertical_compaction_test";
+    const std::string kTestDir = "/ut_dir/ordered_compaction_test";
+    const std::string tmp_dir = "./ut_dir/ordered_compaction_test/tmp";
     string absolute_dir;
     std::unique_ptr<DataDir> _data_dir;
 };
@@ -487,5 +553,75 @@ TEST_F(OrderedDataCompactionTest, test_01) {
     }
 }
 
+TEST_F(OrderedDataCompactionTest, test_index_disk_size) {
+    auto num_input_rowset = 3;
+    auto num_segments = 2;
+    auto rows_per_segment = 50;
+    std::vector<std::vector<std::vector<std::tuple<int64_t, int64_t>>>> 
input_data;
+    generate_input_data(num_input_rowset, num_segments, rows_per_segment, 
input_data);
+
+    TabletSchemaSPtr tablet_schema = create_inverted_index_v1_schema();
+    TabletSharedPtr tablet = create_tablet(*tablet_schema, false, 10000, 
false);
+    
EXPECT_TRUE(io::global_local_filesystem()->create_directory(tablet->tablet_path()).ok());
+
+    vector<RowsetSharedPtr> input_rowsets;
+    SegmentsOverlapPB new_overlap = NONOVERLAPPING;
+    for (auto i = 0; i < num_input_rowset; i++) {
+        RowsetWriterContext writer_context;
+        create_rowset_writer_context(tablet_schema, tablet->tablet_path(), 
new_overlap, UINT32_MAX,
+                                     &writer_context);
+
+        auto res = RowsetFactory::create_rowset_writer(*engine_ref, 
writer_context, false);
+        EXPECT_TRUE(res.has_value()) << res.error();
+        auto rowset_writer = std::move(res).value();
+
+        uint32_t num_rows = 0;
+        for (int j = 0; j < input_data[i].size(); ++j) {
+            vectorized::Block block = tablet_schema->create_block();
+            auto columns = block.mutate_columns();
+            for (int rid = 0; rid < input_data[i][j].size(); ++rid) {
+                int32_t c1 = std::get<0>(input_data[i][j][rid]);
+                int32_t c2 = std::get<1>(input_data[i][j][rid]);
+                columns[0]->insert_data((const char*)&c1, sizeof(c1));
+                columns[1]->insert_data((const char*)&c2, sizeof(c2));
+
+                if (tablet_schema->keys_type() == UNIQUE_KEYS) {
+                    uint8_t num = 0;
+                    columns[2]->insert_data((const char*)&num, sizeof(num));
+                }
+                num_rows++;
+            }
+            auto s = rowset_writer->add_block(&block);
+            EXPECT_TRUE(s.ok());
+            s = rowset_writer->flush();
+            EXPECT_TRUE(s.ok());
+        }
+
+        RowsetSharedPtr rowset;
+        EXPECT_EQ(Status::OK(), rowset_writer->build(rowset));
+        EXPECT_EQ(input_data[i].size(), rowset->rowset_meta()->num_segments());
+        EXPECT_EQ(num_rows, rowset->rowset_meta()->num_rows());
+
+        // Set random index_disk_size
+        rowset->rowset_meta()->set_index_disk_size(1024000000000000LL);
+        input_rowsets.push_back(rowset);
+    }
+
+    CumulativeCompaction cu_compaction(*engine_ref, tablet);
+    cu_compaction._input_rowsets = std::move(input_rowsets);
+    EXPECT_EQ(cu_compaction.handle_ordered_data_compaction(), true);
+
+    auto& out_rowset = cu_compaction._output_rowset;
+
+    // Verify the index_disk_size of the output rowset
+    int64_t expected_total_size = 0;
+    for (const auto& rowset : cu_compaction._input_rowsets) {
+        expected_total_size += rowset->rowset_meta()->total_disk_size();
+    }
+    std::cout << "expected_total_size: " << expected_total_size << std::endl;
+    std::cout << "actual_total_disk_size: " << 
out_rowset->rowset_meta()->total_disk_size()
+              << std::endl;
+    EXPECT_EQ(out_rowset->rowset_meta()->total_disk_size(), 
expected_total_size);
+}
 } // namespace vectorized
 } // namespace doris
diff --git a/be/test/testutil/mock_rowset.h b/be/test/testutil/mock_rowset.h
index 36ef64fd9ec..1d6638863df 100644
--- a/be/test/testutil/mock_rowset.h
+++ b/be/test/testutil/mock_rowset.h
@@ -50,6 +50,10 @@ class MockRowset : public Rowset {
         return Status::NotSupported("MockRowset not support this method.");
     }
 
+    Status get_inverted_index_size(int64_t* index_size) override {
+        return Status::NotSupported("MockRowset not support this method.");
+    }
+
     void clear_inverted_index_cache() override {}
 
     Status get_segments_key_bounds(std::vector<KeyBoundsPB>* 
segments_key_bounds) override {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to