This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new db8793db79b branch-3.0-pick: [fix](cloud) Adjust rowset state check in `CloudTablet::create_transient_rowset_writer` (#45644)

db8793db79b is described below

commit db8793db79b632c3a13b600a6cee0ed5d53e98a3
Author: bobhan1 <bao...@selectdb.com>
AuthorDate: Thu Dec 19 23:57:56 2024 +0800

    branch-3.0-pick: [fix](cloud) Adjust rowset state check in `CloudTablet::create_transient_rowset_writer` (#45644)

    pick https://github.com/apache/doris/pull/45496
---
 be/src/cloud/cloud_tablet.cpp                            |  23 +++--
 .../cloud/test_cloud_mow_partial_update_retry.out        |  16 ++++
 .../cloud/test_cloud_mow_partial_update_retry.groovy     | 100 +++++++++++++++++++++
 3 files changed, 131 insertions(+), 8 deletions(-)

diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index fe9494ef149..76cb9042af4 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -54,6 +54,7 @@ namespace doris {
 using namespace ErrorCode;
 
 static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1;
+static constexpr int LOAD_INITIATOR_ID = -1;
 
 CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr tablet_meta)
         : BaseTablet(std::move(tablet_meta)), _engine(engine) {}
@@ -504,13 +505,19 @@ Result<std::unique_ptr<RowsetWriter>> CloudTablet::create_rowset_writer(
 Result<std::unique_ptr<RowsetWriter>> CloudTablet::create_transient_rowset_writer(
         const Rowset& rowset, std::shared_ptr<PartialUpdateInfo> partial_update_info,
         int64_t txn_expiration) {
-    if (rowset.rowset_meta()->rowset_state() != RowsetStatePB::BEGIN_PARTIAL_UPDATE) [[unlikely]] {
-        // May cause the segment files generated by the transient rowset writer unable to be
-        // recycled, see `CloudRowsetWriter::build` for detail.
-        LOG(WARNING) << "Wrong rowset state: " << rowset.rowset_meta()->rowset_state();
-        DCHECK(false) << rowset.rowset_meta()->rowset_state();
+    if (rowset.rowset_meta_state() != RowsetStatePB::BEGIN_PARTIAL_UPDATE &&
+        rowset.rowset_meta_state() != RowsetStatePB::COMMITTED) [[unlikely]] {
+        auto msg = fmt::format(
+                "wrong rowset state when create_transient_rowset_writer, rowset state should be "
+                "BEGIN_PARTIAL_UPDATE or COMMITTED, but found {}, rowset_id={}, tablet_id={}",
+                RowsetStatePB_Name(rowset.rowset_meta_state()), rowset.rowset_id().to_string(),
+                tablet_id());
+        // see `CloudRowsetWriter::build` for detail.
+        // if this is in a retry task, the rowset state may have been changed to RowsetStatePB::COMMITTED
+        // in `RowsetMeta::merge_rowset_meta()` in previous trials.
+        LOG(WARNING) << msg;
+        DCHECK(false) << msg;
     }
-
     RowsetWriterContext context;
     context.rowset_state = PREPARED;
     context.segments_overlap = OVERLAPPING;
@@ -717,8 +724,8 @@ Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx
         }
     }
 
-    RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(
-            *this, txn_id, COMPACTION_DELETE_BITMAP_LOCK_ID, new_delete_bitmap.get()));
+    RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap(*this, txn_id, LOAD_INITIATOR_ID,
+                                                            new_delete_bitmap.get()));
 
     // store the delete bitmap with sentinel marks in txn_delete_bitmap_cache because if the txn is retried for some reason,
     // it will use the delete bitmap from txn_delete_bitmap_cache when re-calculating the delete bitmap, during which it will do
diff --git a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
new file mode 100644
index 00000000000..3b24419bdc6
--- /dev/null
+++ b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !sql --
+1	1	1	1
+2	2	2	2
+3	3	3	2
+
+-- !sql --
+1	1	888	1
+2	2	777	2
+3	3	3	2
+
+-- !sql --
+1	999	888	1
+2	666	777	2
+3	3	3	2
+
diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
new file mode 100644
index 00000000000..13abaf1ffca
--- /dev/null
+++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_partial_update_retry.groovy
@@ -0,0 +1,100 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_cloud_mow_partial_update_retry", "nonConcurrent") {
+    if (!isCloudMode()) {
+        return
+    }
+
+    GetDebugPoint().clearDebugPointsForAllFEs()
+    GetDebugPoint().clearDebugPointsForAllBEs()
+
+    def customFeConfig = [
+            delete_bitmap_lock_expiration_seconds : 10,
+            calculate_delete_bitmap_task_timeout_seconds : 15,
+    ]
+
+    setFeConfigTemporary(customFeConfig) {
+
+        def table1 = "test_cloud_mow_partial_update_retry"
+        sql "DROP TABLE IF EXISTS ${table1} FORCE;"
+        sql """ CREATE TABLE IF NOT EXISTS ${table1} (
+                `k1` int NOT NULL,
+                `c1` int,
+                `c2` int,
+                `c3` int
+                )UNIQUE KEY(k1)
+            DISTRIBUTED BY HASH(k1) BUCKETS 1
+            PROPERTIES (
+                "enable_unique_key_merge_on_write" = "true",
+                "disable_auto_compaction" = "true",
+                "replication_num" = "1"); """
+
+        sql "insert into ${table1} values(1,1,1,1);"
+        sql "insert into ${table1} values(2,2,2,2);"
+        sql "insert into ${table1} values(3,3,3,2);"
+        sql "sync;"
+        qt_sql "select * from ${table1} order by k1;"
+
+        try {
+            // block the first load
+            GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait", [token: "token1"])
+            GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block", [wait_token: "token1"])
+
+            // the first load
+            t1 = Thread.start {
+                sql "set enable_unique_key_partial_update=true;"
+                sql "sync;"
+                sql "insert into ${table1}(k1,c1) values(1,999),(2,666);"
+            }
+
+            // wait until the first partial update load's delete bitmap update lock expires
+            // to ensure that the second load can take the delete bitmap update lock
+            // Config.delete_bitmap_lock_expiration_seconds = 10s
+            Thread.sleep(11 * 1000)
+
+            // the second load
+            GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.enable_spin_wait", [token: "token2"])
+            Thread.sleep(200)
+
+            sql "set enable_unique_key_partial_update=true;"
+            sql "sync;"
+            sql "insert into ${table1}(k1,c2) values(1,888),(2,777);"
+
+            qt_sql "select * from ${table1} order by k1;"
+
+
+            // keep waiting until the delete bitmap calculation times out (Config.calculate_delete_bitmap_task_timeout_seconds = 15s)
+            // and the first load will retry the calculation of delete bitmap
+            Thread.sleep(15 * 1000)
+
+            // let the first partial update load finish
+            GetDebugPoint().enableDebugPointForAllBEs("BaseTablet::update_delete_bitmap.block")
+            t1.join()
+
+            Thread.sleep(1000)
+
+            qt_sql "select * from ${table1} order by k1;"
+
+        } catch(Exception e) {
+            logger.info(e.getMessage())
+            throw e
+        } finally {
+            GetDebugPoint().clearDebugPointsForAllBEs()
+        }
+    }
+}
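
For readers skimming the diff: the functional change in `cloud_tablet.cpp` is that `create_transient_rowset_writer` now accepts a rowset whose meta state is either `BEGIN_PARTIAL_UPDATE` or `COMMITTED`, because a retried partial-update load may see a rowset that an earlier trial already moved to `COMMITTED` in `RowsetMeta::merge_rowset_meta()`. Below is a minimal, self-contained C++ sketch of that accept-list check, not the real Doris code: the enum and function names (`RowsetState`, `can_create_transient_writer`) are illustrative stand-ins for `RowsetStatePB` and the check inside `CloudTablet::create_transient_rowset_writer`.

```cpp
// Minimal sketch (assumes C++20 for [[unlikely]]), mirroring the adjusted check
// from commit db8793db79b. Names here are illustrative, not the Doris types.
#include <iostream>
#include <string>

// Simplified stand-in for RowsetStatePB.
enum class RowsetState { PREPARED, BEGIN_PARTIAL_UPDATE, COMMITTED, VISIBLE };

std::string to_string(RowsetState s) {
    switch (s) {
    case RowsetState::PREPARED:             return "PREPARED";
    case RowsetState::BEGIN_PARTIAL_UPDATE: return "BEGIN_PARTIAL_UPDATE";
    case RowsetState::COMMITTED:            return "COMMITTED";
    case RowsetState::VISIBLE:              return "VISIBLE";
    }
    return "UNKNOWN";
}

// Before the fix only BEGIN_PARTIAL_UPDATE was accepted, so a retried partial
// update whose rowset meta had already been merged to COMMITTED tripped the
// warning/DCHECK. After the fix both states are treated as valid.
bool can_create_transient_writer(RowsetState state) {
    if (state != RowsetState::BEGIN_PARTIAL_UPDATE &&
        state != RowsetState::COMMITTED) [[unlikely]] {
        std::cerr << "wrong rowset state for transient rowset writer: "
                  << to_string(state) << '\n';
        return false;
    }
    return true;
}

int main() {
    // First attempt of a partial-update load.
    std::cout << can_create_transient_writer(RowsetState::BEGIN_PARTIAL_UPDATE) << '\n'; // 1
    // Retried attempt: meta already merged to COMMITTED in an earlier trial.
    std::cout << can_create_transient_writer(RowsetState::COMMITTED) << '\n';            // 1
    // A genuinely wrong state still warns.
    std::cout << can_create_transient_writer(RowsetState::VISIBLE) << '\n';              // 0
}
```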
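
The second hunk in `cloud_tablet.cpp` replaces `COMPACTION_DELETE_BITMAP_LOCK_ID` with a new `LOAD_INITIATOR_ID` constant in `save_delete_bitmap`. Both constants are -1 in the diff, so the value sent at this call site is unchanged; the point is that the caller now names the initiator it actually represents. A hedged sketch of that naming pattern follows, with a made-up `update_delete_bitmap` stub standing in for the real `_engine.meta_mgr().update_delete_bitmap(...)` call.

```cpp
// Sketch only, not the Doris API: shows why the load path passes a constant
// named for loads instead of reusing the compaction lock id.
#include <cstdint>
#include <iostream>

// Values copied from the diff: both are -1 today, so only the name changes.
static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1;
static constexpr int LOAD_INITIATOR_ID = -1;

// Hypothetical stub standing in for CloudMetaMgr::update_delete_bitmap().
void update_delete_bitmap(int64_t txn_id, int initiator_id) {
    std::cout << "update_delete_bitmap txn_id=" << txn_id
              << " initiator=" << initiator_id << '\n';
}

int main() {
    // What save_delete_bitmap does after the patch (load path).
    update_delete_bitmap(/*txn_id=*/1001, LOAD_INITIATOR_ID);
    // A compaction caller keeps its own named id.
    update_delete_bitmap(/*txn_id=*/1002, COMPACTION_DELETE_BITMAP_LOCK_ID);
}
```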