This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 25c864dfea3 [fix](cloud) Fix balanced_tablets_shards leak memory and
fix some file cache case (#59093)
25c864dfea3 is described below
commit 25c864dfea3cc71809bdc04a83dd783c78671a38
Author: deardeng <[email protected]>
AuthorDate: Tue Dec 23 02:33:59 2025 +0800
[fix](cloud) Fix balanced_tablets_shards leak memory and fix some file
cache case (#59093)
### What problem does this PR solve?
1. When fixing the issue of CloudWarmUpManager leaking a small amount of
memory and failing to remove (tablet_id, peer_addr) from memory during
warm-up balance when file cache meta is not retrieved from source, the
issue was resolved.
2. Fixed several file cache Docker cases affected by small file
optimization PR(https://github.com/apache/doris/pull/57770)
Issue Number: close #xxx
Related PR: #xxx
Problem Summary:
### Release note
None
### Check List (For Author)
- Test <!-- At least one of them must be included. -->
- [ ] Regression test
- [ ] Unit Test
- [ ] Manual test (add detailed scripts or steps below)
- [ ] No need to test or manual test. Explain why:
- [ ] This is a refactor/code format and no logic has been changed.
- [x] Previous test can cover this change.
- [ ] No code files have been changed.
- [ ] Other reason <!-- Add your reason? -->
- Behavior changed:
- [x] No.
- [ ] Yes. <!-- Explain the behavior change -->
- Does this need documentation?
- [x] No.
- [ ] Yes. <!-- Add document PR link here. eg:
https://github.com/apache/doris-website/pull/1214 -->
### Check List (For Reviewer who merge this PR)
- [ ] Confirm the release note
- [ ] Confirm test cases
- [ ] Confirm document
- [ ] Add branch pick label <!-- Add branch pick label that this PR
should merge into -->
---
be/src/cloud/cloud_warm_up_manager.cpp | 31 ++++++++++++++++++++++
be/src/cloud/cloud_warm_up_manager.h | 2 ++
be/src/io/cache/block_file_cache_downloader.cpp | 29 +-------------------
...est_balance_use_compute_group_properties.groovy | 3 ++-
.../cloud_p0/balance/test_balance_warm_up.groovy | 3 ++-
.../test_balance_warm_up_sync_global_config.groovy | 3 ++-
.../test_balance_warm_up_task_abnormal.groovy | 3 ++-
.../test_balance_warm_up_use_peer_cache.groovy | 3 ++-
...e_warm_up_with_compaction_use_peer_cache.groovy | 3 ++-
.../balance/test_peer_read_async_warmup.groovy | 1 +
.../tablets/test_clean_stale_rs_file_cache.groovy | 3 ++-
.../test_clean_stale_rs_index_file_cache.groovy | 3 ++-
.../test_clean_tablet_when_drop_force_table.groovy | 3 ++-
.../test_clean_tablet_when_rebalance.groovy | 3 ++-
14 files changed, 55 insertions(+), 38 deletions(-)
diff --git a/be/src/cloud/cloud_warm_up_manager.cpp
b/be/src/cloud/cloud_warm_up_manager.cpp
index 2e00592d0a3..272b690facf 100644
--- a/be/src/cloud/cloud_warm_up_manager.cpp
+++ b/be/src/cloud/cloud_warm_up_manager.cpp
@@ -19,6 +19,8 @@
#include <bthread/condition_variable.h>
#include <bthread/mutex.h>
+#include <bthread/unstable.h>
+#include <butil/time.h>
#include <bvar/bvar.h>
#include <bvar/reducer.h>
@@ -795,10 +797,39 @@ void CloudWarmUpManager::record_balanced_tablet(int64_t
tablet_id, const std::st
meta.brpc_port = brpc_port;
shard.tablets.emplace(tablet_id, std::move(meta));
g_balance_tablet_be_mapping_size << 1;
+ schedule_remove_balanced_tablet(tablet_id);
VLOG_DEBUG << "Recorded balanced warm up cache tablet: tablet_id=" <<
tablet_id
<< ", host=" << host << ":" << brpc_port;
}
+void CloudWarmUpManager::schedule_remove_balanced_tablet(int64_t tablet_id) {
+ // Use std::make_unique to avoid raw pointer allocation
+ auto tablet_id_ptr = std::make_unique<int64_t>(tablet_id);
+ unsigned long expired_ms = g_tablet_report_inactive_duration_ms;
+ if (doris::config::cache_read_from_peer_expired_seconds > 0 &&
+ doris::config::cache_read_from_peer_expired_seconds <=
+ g_tablet_report_inactive_duration_ms / 1000) {
+ expired_ms = doris::config::cache_read_from_peer_expired_seconds *
1000;
+ }
+ bthread_timer_t timer_id;
+ // ATTN: The timer callback will reclaim ownership of the tablet_id_ptr,
so we need to release it after the timer is added.
+ if (const int rc = bthread_timer_add(&timer_id,
butil::milliseconds_from_now(expired_ms),
+ clean_up_expired_mappings,
tablet_id_ptr.get());
+ rc == 0) {
+ tablet_id_ptr.release();
+ } else {
+ LOG(WARNING) << "Fail to add timer for clean up expired mappings for
tablet_id="
+ << tablet_id << " rc=" << rc;
+ }
+}
+
+void CloudWarmUpManager::clean_up_expired_mappings(void* arg) {
+ std::unique_ptr<int64_t> tid(static_cast<int64_t*>(arg));
+ auto& manager =
ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager();
+ manager.remove_balanced_tablet(*tid);
+ VLOG_DEBUG << "Removed expired balanced warm up cache tablet: tablet_id="
<< *tid;
+}
+
std::optional<std::pair<std::string, int32_t>>
CloudWarmUpManager::get_balanced_tablet_info(
int64_t tablet_id) {
auto& shard = get_shard(tablet_id);
diff --git a/be/src/cloud/cloud_warm_up_manager.h
b/be/src/cloud/cloud_warm_up_manager.h
index 8725dc06939..f5b00394a27 100644
--- a/be/src/cloud/cloud_warm_up_manager.h
+++ b/be/src/cloud/cloud_warm_up_manager.h
@@ -98,6 +98,8 @@ public:
std::unordered_map<int64_t, std::pair<std::string, int32_t>>
get_all_balanced_tablets() const;
private:
+ void schedule_remove_balanced_tablet(int64_t tablet_id);
+ static void clean_up_expired_mappings(void* arg);
void handle_jobs();
Status _do_warm_up_rowset(RowsetMeta& rs_meta, std::vector<TReplicaInfo>&
replicas,
diff --git a/be/src/io/cache/block_file_cache_downloader.cpp
b/be/src/io/cache/block_file_cache_downloader.cpp
index 27f046140b3..5113c7f8b3e 100644
--- a/be/src/io/cache/block_file_cache_downloader.cpp
+++ b/be/src/io/cache/block_file_cache_downloader.cpp
@@ -173,14 +173,6 @@ std::unordered_map<std::string, RowsetMetaSharedPtr>
snapshot_rs_metas(BaseTable
return id_to_rowset_meta_map;
}
-static void clean_up_expired_mappings(void* arg) {
- // Reclaim ownership with unique_ptr for automatic memory management
- std::unique_ptr<int64_t> tablet_id(static_cast<int64_t*>(arg));
- auto& manager =
ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager();
- manager.remove_balanced_tablet(*tablet_id);
- VLOG_DEBUG << "Removed expired balanced warm up cache tablet: tablet_id="
<< *tablet_id;
-}
-
void FileCacheBlockDownloader::download_file_cache_block(
const DownloadTask::FileCacheBlockMetaVec& metas) {
std::unordered_set<int64_t> synced_tablets;
@@ -236,27 +228,8 @@ void FileCacheBlockDownloader::download_file_cache_block(
<< "]";
}
}
- // Use std::make_unique to avoid raw pointer allocation
- auto tablet_id_ptr = std::make_unique<int64_t>(tablet_id);
- unsigned long expired_ms = g_tablet_report_inactive_duration_ms;
- if (doris::config::cache_read_from_peer_expired_seconds > 0 &&
- doris::config::cache_read_from_peer_expired_seconds <=
- g_tablet_report_inactive_duration_ms / 1000) {
- expired_ms =
doris::config::cache_read_from_peer_expired_seconds * 1000;
- }
- bthread_timer_t timer_id;
- // ATTN: The timer callback will reclaim ownership of the
tablet_id_ptr, so we need to release it after the timer is added.
- if (const int rc =
- bthread_timer_add(&timer_id,
butil::milliseconds_from_now(expired_ms),
- clean_up_expired_mappings,
tablet_id_ptr.get());
- rc == 0) {
- tablet_id_ptr.release();
- } else {
- LOG(WARNING) << "Fail to add timer for clean up expired
mappings for tablet_id="
- << tablet_id << " rc=" << rc;
- }
LOG(INFO) << "download_file_cache_block: download_done,
tablet_Id=" << tablet_id
- << " status=" << st.to_string() << " expired_ms=" <<
expired_ms;
+ << " status=" << st.to_string();
};
std::string path;
diff --git
a/regression-test/suites/cloud_p0/balance/test_balance_use_compute_group_properties.groovy
b/regression-test/suites/cloud_p0/balance/test_balance_use_compute_group_properties.groovy
index f96119ec29b..de54b93f2d8 100644
---
a/regression-test/suites/cloud_p0/balance/test_balance_use_compute_group_properties.groovy
+++
b/regression-test/suites/cloud_p0/balance/test_balance_use_compute_group_properties.groovy
@@ -36,7 +36,8 @@ suite('test_balance_use_compute_group_properties', 'docker') {
'report_tablet_interval_seconds=1',
'schedule_sync_tablets_interval_s=18000',
'disable_auto_compaction=true',
- 'sys_log_verbose_modules=*'
+ 'sys_log_verbose_modules=*',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up.groovy
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up.groovy
index db28efafad8..037d54e1d6e 100644
--- a/regression-test/suites/cloud_p0/balance/test_balance_warm_up.groovy
+++ b/regression-test/suites/cloud_p0/balance/test_balance_warm_up.groovy
@@ -37,7 +37,8 @@ suite('test_balance_warm_up', 'docker') {
'disable_auto_compaction=true',
'sys_log_verbose_modules=*',
'cache_read_from_peer_expired_seconds=100',
- 'enable_cache_read_from_peer=true'
+ 'enable_cache_read_from_peer=true',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_sync_global_config.groovy
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_sync_global_config.groovy
index 19c252fba85..f2af4b086a5 100644
---
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_sync_global_config.groovy
+++
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_sync_global_config.groovy
@@ -36,7 +36,8 @@ suite('test_balance_warm_up_sync_cache', 'docker') {
'report_tablet_interval_seconds=1',
'schedule_sync_tablets_interval_s=18000',
'disable_auto_compaction=true',
- 'sys_log_verbose_modules=*'
+ 'sys_log_verbose_modules=*',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_task_abnormal.groovy
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_task_abnormal.groovy
index 0938126d61c..fe5dc84ef67 100644
---
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_task_abnormal.groovy
+++
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_task_abnormal.groovy
@@ -36,7 +36,8 @@ suite('test_balance_warm_up_task_abnormal', 'docker') {
'report_tablet_interval_seconds=1',
'schedule_sync_tablets_interval_s=18000',
'disable_auto_compaction=true',
- 'sys_log_verbose_modules=*'
+ 'sys_log_verbose_modules=*',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_use_peer_cache.groovy
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_use_peer_cache.groovy
index f9202e702ed..4eeaac1b5e6 100644
---
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_use_peer_cache.groovy
+++
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_use_peer_cache.groovy
@@ -40,7 +40,8 @@ suite('test_balance_warm_up_use_peer_cache', 'docker') {
'disable_auto_compaction=true',
'sys_log_verbose_modules=*',
'cache_read_from_peer_expired_seconds=100',
- 'enable_cache_read_from_peer=true'
+ 'enable_cache_read_from_peer=true',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_with_compaction_use_peer_cache.groovy
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_with_compaction_use_peer_cache.groovy
index be120221019..548cf1c3fc7 100644
---
a/regression-test/suites/cloud_p0/balance/test_balance_warm_up_with_compaction_use_peer_cache.groovy
+++
b/regression-test/suites/cloud_p0/balance/test_balance_warm_up_with_compaction_use_peer_cache.groovy
@@ -41,7 +41,8 @@ suite('test_balance_warm_up_with_compaction_use_peer_cache',
'docker') {
'sys_log_verbose_modules=*',
'cumulative_compaction_min_deltas=5',
'cache_read_from_peer_expired_seconds=100',
- 'enable_cache_read_from_peer=true'
+ 'enable_cache_read_from_peer=true',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/balance/test_peer_read_async_warmup.groovy
b/regression-test/suites/cloud_p0/balance/test_peer_read_async_warmup.groovy
index ba9a61409bb..8a3469ad8e1 100644
--- a/regression-test/suites/cloud_p0/balance/test_peer_read_async_warmup.groovy
+++ b/regression-test/suites/cloud_p0/balance/test_peer_read_async_warmup.groovy
@@ -39,6 +39,7 @@ suite('test_peer_read_async_warmup', 'docker') {
'disable_auto_compaction=true',
'sys_log_verbose_modules=*',
'enable_cache_read_from_peer=true',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
index 807e51ae95f..e161b44ce0f 100644
---
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
+++
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_file_cache.groovy
@@ -34,7 +34,8 @@ suite('test_clean_stale_rs_file_cache', 'docker') {
'cumulative_compaction_min_deltas=5',
'tablet_rowset_stale_sweep_by_size=false',
'tablet_rowset_stale_sweep_time_sec=60',
- 'vacuum_stale_rowsets_interval_s=10'
+ 'vacuum_stale_rowsets_interval_s=10',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
index e99a5f96b36..59e13b1030b 100644
---
a/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
+++
b/regression-test/suites/cloud_p0/tablets/test_clean_stale_rs_index_file_cache.groovy
@@ -34,7 +34,8 @@ suite('test_clean_stale_rs_index_file_cache', 'docker') {
'cumulative_compaction_min_deltas=5',
'tablet_rowset_stale_sweep_by_size=false',
'tablet_rowset_stale_sweep_time_sec=60',
- 'vacuum_stale_rowsets_interval_s=10'
+ 'vacuum_stale_rowsets_interval_s=10',
+ 'enable_packed_file=false',
]
options.setFeNum(1)
options.setBeNum(1)
diff --git
a/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy
b/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy
index ff8753a857e..b0e55bd8aef 100644
---
a/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy
+++
b/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_drop_force_table.groovy
@@ -34,7 +34,8 @@ suite('test_clean_tablet_when_drop_force_table', 'docker') {
'report_tablet_interval_seconds=1',
'write_buffer_size=10240',
'write_buffer_size_for_agg=10240',
- 'sys_log_verbose_modules=task_worker_pool'
+ 'sys_log_verbose_modules=task_worker_pool',
+ 'enable_packed_file=false',
]
options.setFeNum(3)
options.setBeNum(3)
diff --git
a/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_rebalance.groovy
b/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_rebalance.groovy
index 75c2c410a23..964ba517bb7 100644
---
a/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_rebalance.groovy
+++
b/regression-test/suites/cloud_p0/tablets/test_clean_tablet_when_rebalance.groovy
@@ -34,7 +34,8 @@ suite('test_clean_tablet_when_rebalance', 'docker') {
options.beConfigs += [
'report_tablet_interval_seconds=1',
'write_buffer_size=10240',
- 'write_buffer_size_for_agg=10240'
+ 'write_buffer_size_for_agg=10240',
+ 'enable_packed_file=false',
]
options.setFeNum(3)
options.setBeNum(3)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]