github-actions[bot] commented on code in PR #64496:
URL: https://github.com/apache/doris/pull/64496#discussion_r3412418220
##########
be/src/io/cache/block_file_cache.cpp:
##########
@@ -2189,6 +2140,154 @@ void BlockFileCache::run_background_ttl_gc() {
}
}
+size_t BlockFileCache::repair_duplicate_ttl_dirs_once() {
+ if (!config::enable_file_cache_ttl_repair_checker ||
+ _storage->get_type() != FileCacheStorageType::DISK ||
!_async_open_done) {
+ return 0;
+ }
+
+ const int64_t max_scan_prefix_dirs = std::max<int64_t>(
+ 0,
config::file_cache_ttl_repair_checker_max_scan_prefix_dirs_per_round);
+ if (max_scan_prefix_dirs == 0) {
+ return 0;
+ }
+
+ int64_t duration_ns = 0;
+ std::vector<FileCacheDuplicateKeyDirs> duplicate_dirs;
+ size_t next_prefix_index = _ttl_repair_checker_next_prefix_index;
+ size_t scanned_prefix_dirs = 0;
+ {
+ SCOPED_RAW_TIMER(&duration_ns);
+ auto st =
_storage->list_duplicate_key_dirs(_ttl_repair_checker_next_prefix_index,
+ max_scan_prefix_dirs,
&duplicate_dirs,
+ &next_prefix_index,
&scanned_prefix_dirs);
+ if (!st.ok()) {
+ LOG(WARNING) << "Failed to scan duplicate ttl dirs for file cache
" << _cache_base_path
+ << ", error=" << st;
+ *_ttl_repair_checker_latency_us << (duration_ns / 1000);
+ return 0;
+ }
+ _ttl_repair_checker_next_prefix_index = next_prefix_index;
+ }
+ *_ttl_repair_checker_scanned_prefix_dirs << scanned_prefix_dirs;
+ *_ttl_repair_checker_suspect_hashes << duplicate_dirs.size();
+
+ const int64_t max_repairs =
+ std::max<int64_t>(0,
config::file_cache_ttl_repair_checker_max_repairs_per_round);
+ const int64_t repair_sleep_ms =
+ std::max<int64_t>(0,
config::file_cache_ttl_repair_checker_repair_sleep_ms);
+ if (max_repairs == 0) {
+ *_ttl_repair_checker_latency_us << (duration_ns / 1000);
+ return 0;
+ }
+
+ const auto sleep_per_repair = std::chrono::milliseconds(repair_sleep_ms);
+ size_t repaired_dirs = 0;
+ size_t skipped_hashes = 0;
+ for (const auto& duplicate : duplicate_dirs) {
+ if (_close || repaired_dirs >= static_cast<size_t>(max_repairs)) {
+ break;
+ }
+
+ uint64_t canonical_expiration_time = 0;
+ bool should_repair = false;
+ {
+ SCOPED_CACHE_LOCK(_mutex, this);
+ auto file_iter = _files.find(duplicate.hash);
+ if (file_iter == _files.end() || file_iter->second.empty()) {
+ ++skipped_hashes;
+ continue;
+ }
+
+ bool has_canonical = false;
+ bool has_unstable_block = false;
+ bool has_multiple_expiration_times = false;
+ for (auto& [_, cell] : file_iter->second) {
+ std::lock_guard block_lock(cell.file_block->_mutex);
+ if (cell.file_block->state_unlock(block_lock) !=
FileBlock::State::DOWNLOADED) {
+ has_unstable_block = true;
+ break;
+ }
+
+ const auto block_expiration_time =
cell.file_block->expiration_time();
+ if (!has_canonical) {
+ canonical_expiration_time = block_expiration_time;
+ has_canonical = true;
+ } else if (canonical_expiration_time != block_expiration_time)
{
+ has_multiple_expiration_times = true;
+ break;
+ }
+ }
+
+ should_repair =
+ has_canonical && !has_unstable_block &&
!has_multiple_expiration_times &&
+ std::find(duplicate.expiration_times.begin(),
duplicate.expiration_times.end(),
+ canonical_expiration_time) !=
duplicate.expiration_times.end();
+ if (!should_repair) {
+ ++skipped_hashes;
+ }
+ }
+
+ if (!should_repair) {
+ continue;
+ }
+
+ for (auto expiration_time : duplicate.expiration_times) {
+ if (_close || repaired_dirs >= static_cast<size_t>(max_repairs)) {
+ break;
+ }
Review Comment:
This deletion is based on the validation done under `_mutex` above, but the
lock is released before `remove_key_dir()` recursively removes the directory.
That makes the decision staleable: while this thread is between validation and
this call, another thread can remove/recreate the same hash with this
expiration, or retag the hash to this expiration if the duplicate directory is
empty/removed. `LocalFileSystem::delete_directory()` uses `remove_all`, so the
repair pass can then delete a directory that has become the current cache
location and leave the in-memory TTL indexes/block metadata pointing at files
that no longer exist. Please revalidate or serialize the stale-dir removal with
the cache metadata transition, e.g. by holding the cache lock through the
remove or by marking the hash/expiration so it cannot become current between
validation and deletion.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]