This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 2abbc7898a3 [Opt](cloud) Add inject points for cloud mow (#48190) 2abbc7898a3 is described below commit 2abbc7898a3d2f06b89ba783afd875239a6558bd Author: bobhan1 <bao...@selectdb.com> AuthorDate: Tue Feb 25 22:07:38 2025 +0800 [Opt](cloud) Add inject points for cloud mow (#48190) --- be/src/cloud/cloud_meta_mgr.cpp | 11 ++++ be/src/cloud/cloud_schema_change_job.cpp | 16 +++++ .../cloud/test_cloud_mow_correctness_inject.out | Bin 185 -> 368 bytes .../cloud/test_cloud_mow_correctness_inject.groovy | 71 ++++++++++++++++++++- 4 files changed, 95 insertions(+), 3 deletions(-) diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index afac47e3645..41e60b5e264 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -1183,6 +1183,17 @@ Status CloudMetaMgr::cloud_update_delete_bitmap_without_lock(const CloudTablet& Status CloudMetaMgr::get_delete_bitmap_update_lock(const CloudTablet& tablet, int64_t lock_id, int64_t initiator) { + DBUG_EXECUTE_IF("get_delete_bitmap_update_lock.inject_fail", { + auto p = dp->param("percent", 0.01); + std::mt19937 gen {std::random_device {}()}; + std::bernoulli_distribution inject_fault {p}; + if (inject_fault(gen)) { + return Status::Error<ErrorCode::DELETE_BITMAP_LOCK_ERROR>( + "injection error when get get_delete_bitmap_update_lock, " + "tablet_id={}, lock_id={}, initiator={}", + tablet.tablet_id(), lock_id, initiator); + } + }); VLOG_DEBUG << "get_delete_bitmap_update_lock , tablet_id: " << tablet.tablet_id() << ",lock_id:" << lock_id; GetDeleteBitmapUpdateLockRequest req; diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index d12bcdaa01e..7c584d999bf 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -21,6 +21,7 @@ #include <chrono> #include <memory> +#include <random> #include <thread> #include "cloud/cloud_meta_mgr.h" @@ -463,6 +464,9 @@ Status CloudSchemaChangeJob::_process_delete_bitmap(int64_t alter_version, } } + DBUG_EXECUTE_IF("CloudSchemaChangeJob::_process_delete_bitmap.before_new_inc.block", + DBUG_BLOCK); + // step 2, process incremental rowset with delete bitmap update lock RETURN_IF_ERROR(_cloud_storage_engine.meta_mgr().get_delete_bitmap_update_lock( *_new_tablet, SCHEMA_CHANGE_DELETE_BITMAP_LOCK_ID, initiator)); @@ -484,6 +488,18 @@ Status CloudSchemaChangeJob::_process_delete_bitmap(int64_t alter_version, } } + DBUG_EXECUTE_IF("CloudSchemaChangeJob::_process_delete_bitmap.inject_sleep", { + auto p = dp->param("percent", 0.01); + auto sleep_time = dp->param("sleep", 100); + std::mt19937 gen {std::random_device {}()}; + std::bernoulli_distribution inject_fault {p}; + if (inject_fault(gen)) { + LOG_INFO("injection sleep for {} seconds, tablet_id={}, sc job_id={}", sleep_time, + _new_tablet->tablet_id(), _job_id); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + } + }); + auto& delete_bitmap = tmp_tablet->tablet_meta()->delete_bitmap(); // step4, store delete bitmap diff --git a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out index 79839efff32..57619853130 100644 Binary files a/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out and b/regression-test/data/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.out differ diff --git a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy index 3c6ce3e8294..fa447e131d9 100644 --- a/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy +++ b/regression-test/suites/fault_injection_p0/cloud/test_cloud_mow_correctness_inject.groovy @@ -39,8 +39,7 @@ suite("test_cloud_mow_correctness_inject", "nonConcurrent") { PROPERTIES ( "enable_mow_light_delete" = "false", "enable_unique_key_merge_on_write" = "true", - "disable_auto_compaction" = "true", - "replication_num" = "1"); """ + "disable_auto_compaction" = "true"); """ sql "insert into ${table1} values(1,1,1);" sql "insert into ${table1} values(2,2,2);" @@ -48,10 +47,22 @@ suite("test_cloud_mow_correctness_inject", "nonConcurrent") { sql "sync;" qt_sql "select * from ${table1} order by k1;" + def waitForSC = { + Awaitility.await().atMost(30, TimeUnit.SECONDS).pollDelay(100, TimeUnit.MILLISECONDS).pollInterval(1000, TimeUnit.MILLISECONDS).until(() -> { + def res = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE TableName='${table1}' ORDER BY createtime DESC LIMIT 1" + assert res.size() == 1 + if (res[0].State == "FINISHED" || res[0].State == "CANCELLED") { + return true; + } + return false; + }); + } + def customFeConfig = [ delete_bitmap_lock_expiration_seconds : 10, calculate_delete_bitmap_task_timeout_seconds : 2, - mow_calculate_delete_bitmap_retry_times : 3 + mow_calculate_delete_bitmap_retry_times : 3, + enable_schema_change_retry_in_cloud_mode : false // turn off to shorten the test's time consumption ] setFeConfigTemporary(customFeConfig) { @@ -90,5 +101,59 @@ suite("test_cloud_mow_correctness_inject", "nonConcurrent") { GetDebugPoint().clearDebugPointsForAllBEs() } + + try { + GetDebugPoint().enableDebugPointForAllBEs("get_delete_bitmap_update_lock.inject_fail", [percent: "1.0"]) + GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep") + sql "alter table ${table1} modify column c2 varchar(100);" + Thread.sleep(1000) + sql "insert into ${table1} values(10,10,10);" + qt_sql "select * from ${table1} order by k1;" + Thread.sleep(200) + GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob.process_alter_tablet.sleep") + + waitForSC() + + def res = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE TableName='${table1}' ORDER BY createtime DESC LIMIT 1" + assert res[0].State == "CANCELLED" + assert res[0].Msg.contains("injection error when get get_delete_bitmap_update_lock") + + qt_sql "select * from ${table1} order by k1;" + } catch(Exception e) { + logger.info(e.getMessage()) + throw e + } finally { + GetDebugPoint().clearDebugPointsForAllBEs() + } + + + try { + // sleep enough time to let sc's delete bitmap lock expired + GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_process_delete_bitmap.inject_sleep", [percent: "1.0", sleep: "20"]) + GetDebugPoint().enableDebugPointForAllBEs("CloudSchemaChangeJob::_process_delete_bitmap.before_new_inc.block") + sql "alter table ${table1} modify column c2 varchar(100);" + Thread.sleep(3000) + sql "insert into ${table1} values(11,11,11);" + qt_sql "select * from ${table1} order by k1;" + Thread.sleep(1000) + GetDebugPoint().disableDebugPointForAllBEs("CloudSchemaChangeJob::_process_delete_bitmap.before_new_inc.block") + + // wait until sc's delete bitmap expired + Thread.sleep(10000) + sql "insert into ${table1} values(12,12,12);" + + waitForSC() + + def res = sql_return_maparray "SHOW ALTER TABLE COLUMN WHERE TableName='${table1}' ORDER BY createtime DESC LIMIT 1" + assert res[0].State == "CANCELLED" + assert res[0].Msg.contains("[DELETE_BITMAP_LOCK_ERROR]lock expired when update delete bitmap") + + qt_sql "select * from ${table1} order by k1;" + } catch(Exception e) { + logger.info(e.getMessage()) + throw e + } finally { + GetDebugPoint().clearDebugPointsForAllBEs() + } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org