This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new db8793db79b branch-3.0-pick: [fix](cloud) Adjust rowset state check in 
`CloudTablet::create_transient_rowset_writer` (#45644)
db8793db79b is described below

commit db8793db79b632c3a13b600a6cee0ed5d53e98a3
Author: bobhan1 <bao...@selectdb.com>
AuthorDate: Thu Dec 19 23:57:56 2024 +0800

    branch-3.0-pick: [fix](cloud) Adjust rowset state check in 
`CloudTablet::create_transient_rowset_writer` (#45644)
    
    pick https://github.com/apache/doris/pull/45496
---
 be/src/cloud/cloud_tablet.cpp                      |  23 +++--
 .../cloud/test_cloud_mow_partial_update_retry.out  |  16 ++++
 .../test_cloud_mow_partial_update_retry.groovy     | 100 +++++++++++++++++++++
 3 files changed, 131 insertions(+), 8 deletions(-)

diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index fe9494ef149..76cb9042af4 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -54,6 +54,7 @@ namespace doris {
 using namespace ErrorCode;
 
 static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1;
+static constexpr int LOAD_INITIATOR_ID = -1;
 
 CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr 
tablet_meta)
         : BaseTablet(std::move(tablet_meta)), _engine(engine) {}
@@ -504,13 +505,19 @@ Result<std::unique_ptr<RowsetWriter>> 
CloudTablet::create_rowset_writer(
 Result<std::unique_ptr<RowsetWriter>> 
CloudTablet::create_transient_rowset_writer(
         const Rowset& rowset, std::shared_ptr<PartialUpdateInfo> 
partial_update_info,
         int64_t txn_expiration) {
-    if (rowset.rowset_meta()->rowset_state() != 
RowsetStatePB::BEGIN_PARTIAL_UPDATE) [[unlikely]] {
-        // May cause the segment files generated by the transient rowset 
writer unable to be
-        // recycled, see `CloudRowsetWriter::build` for detail.
-        LOG(WARNING) << "Wrong rowset state: " << 
rowset.rowset_meta()->rowset_state();
-        DCHECK(false) << rowset.rowset_meta()->rowset_state();
+    if (rowset.rowset_meta_state() != RowsetStatePB::BEGIN_PARTIAL_UPDATE &&
+        rowset.rowset_meta_state() != RowsetStatePB::COMMITTED) [[unlikely]] {
+        auto msg = fmt::format(
+                "wrong rowset state when create_transient_rowset_writer, 
rowset state should be "
+                "BEGIN_PARTIAL_UPDATE or COMMITTED, but found {}, 
rowset_id={}, tablet_id={}",
+                RowsetStatePB_Name(rowset.rowset_meta_state()), 
rowset.rowset_id().to_string(),
+                tablet_id());
+        // see `CloudRowsetWriter::build` for detail.
+        // if this is a retried task, the rowset state may already have been changed 
to RowsetStatePB::COMMITTED
+        // by `RowsetMeta::merge_rowset_meta()` in a previous attempt.
+        LOG(WARNING) << msg;
+        DCHECK(false) << msg;
     }
-
     RowsetWriterContext context;
     context.rowset_state = PREPARED;
     context.segments_overlap = OVERLAPPING;
@@ -717,8 +724,8 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* 
txn_info, int64_t tx
         }
     }
 
-    RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(
-            *this, txn_id, COMPACTION_DELETE_BITMAP_LOCK_ID, 
new_delete_bitmap.get()));
+    RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(*this, txn_id, 
LOAD_INITIATOR_ID,
+                                                            
new_delete_bitmap.get()));
 
     // store the delete bitmap with sentinel marks in txn_delete_bitmap_cache 
because if the txn is retried for some reason,
     // it will use the delete bitmap from txn_delete_bitmap_cache when 
re-calculating the delete bitmap, during which it will do
diff --git 
a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
 
b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
new file mode 100644
index 00000000000..3b24419bdc6
--- /dev/null
+++ 
b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+1      1       1       1
+2      2       2       2
+3      3       3       2
+
+-- !sql --
+1      1       888     1
+2      2       777     2
+3      3       3       2
+
+-- !sql --
+1      999     888     1
+2      666     777     2
+3      3       3       2
+
diff --git 
a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
 
b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
new file mode 100644
index 00000000000..13abaf1ffca
--- /dev/null
+++ 
b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_cloud_mow_partial_update_retry", "nonConcurrent") {
+    if (!isCloudMode()) {
+        return
+    }
+
+    GetDebugPoint().clearDebugPointsForAllFEs()
+    GetDebugPoint().clearDebugPointsForAllBEs()
+
+    def customFeConfig = [
+        delete_bitmap_lock_expiration_seconds : 10,
+        calculate_delete_bitmap_task_timeout_seconds : 15,
+    ]
+
+    setFeConfigTemporary(customFeConfig) {
+
+        def table1 = "test_cloud_mow_partial_update_retry"
+        sql "DROP TABLE IF EXISTS ${table1} FORCE;"
+        sql """ CREATE TABLE IF NOT EXISTS ${table1} (
+                `k1` int NOT NULL,
+                `c1` int,
+                `c2` int,
+                `c3` int
+                )UNIQUE KEY(k1)
+            DISTRIBUTED BY HASH(k1) BUCKETS 1
+            PROPERTIES (
+                "enable_unique_key_merge_on_write" = "true",
+                "disable_auto_compaction" = "true",
+                "replication_num" = "1"); """
+
+        sql "insert into ${table1} values(1,1,1,1);"
+        sql "insert into ${table1} values(2,2,2,2);"
+        sql "insert into ${table1} values(3,3,3,2);"
+        sql "sync;"
+        qt_sql "select * from ${table1} order by k1;"
+
+        try {
+            // block the first load
+            
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait",
 [token: "token1"])
+            
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block",
 [wait_token: "token1"])
+
+            // the first load
+            t1 = Thread.start {
+                sql "set enable_unique_key_partial_update=true;"
+                sql "sync;"
+                sql "insert into ${table1}(k1,c1) values(1,999),(2,666);"
+            }
+
+            // wait until the first partial update load's delete bitmap update 
lock has expired
+            // to ensure that the second load can take the delete bitmap 
update lock
+            // Config.delete_bitmap_lock_expiration_seconds = 10s
+            Thread.sleep(11 * 1000)
+
+            // the second load
+            
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait",
 [token: "token2"])
+            Thread.sleep(200)
+
+            sql "set enable_unique_key_partial_update=true;"
+            sql "sync;"
+            sql "insert into ${table1}(k1,c2) values(1,888),(2,777);"
+
+            qt_sql "select * from ${table1} order by k1;"
+
+
+            // keep waiting until the delete bitmap calculation times out 
(Config.calculate_delete_bitmap_task_timeout_seconds = 15s)
+            // so that the first load retries the calculation of the delete bitmap
+            Thread.sleep(15 * 1000)
+
+            // let the first partial update load finish
+            
GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block")
+            t1.join()
+
+            Thread.sleep(1000)
+
+            qt_sql "select * from ${table1} order by k1;"
+            
+        } catch(Exception e) {
+            logger.info(e.getMessage())
+            throw e
+        } finally {
+            GetDebugPoint().clearDebugPointsForAllBEs()
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to