This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 4a28d3e2592 [fix](cloud) fix read cache block file return stat NOT_FOUND (pick #43220) (#43512) 4a28d3e2592 is described below commit 4a28d3e25921f17db61a14aafecab96cb363cdbc Author: zhengyu <freeman.zhang1...@gmail.com> AuthorDate: Fri Nov 8 21:17:41 2024 +0800 [fix](cloud) fix read cache block file return stat NOT_FOUND (pick #43220) (#43512) pick #43220 pick #43220 pick #43220 Previously, cache blocks could be downloaded as different types than the target type we intended to read, leading to false cache misses. E.g., a block might be downloaded as an 'idx' type when the current context expected a 'ttl' type. The root cause of this problem is the original design encoding meta info such as type and expiration time into cache block file path and readers of this cache block file have inconsistent view of the type so they use different name to locate file and run into error in the end. This commit tries other type if the initial type failed to locate the file (return NOT_FOUND). Be ware that this is a nasty quick fix. We will elimite the metadate encoded in the file path in the near future to get rid of all the path related problems. --- be/src/common/config.cpp | 2 - be/src/common/config.h | 2 - be/src/io/cache/cached_remote_file_reader.cpp | 2 + be/src/io/cache/fs_file_cache_storage.cpp | 55 ++++++++++++++++++--------- be/src/io/cache/fs_file_cache_storage.h | 3 ++ 5 files changed, 41 insertions(+), 23 deletions(-) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 7de013bb7a5..44aa5e3a31f 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1040,8 +1040,6 @@ DEFINE_mInt64(file_cache_ttl_valid_check_interval_second, "0"); // zero for not // If true, evict the ttl cache using LRU when full. // Otherwise, only expiration can evict ttl and new data won't add to cache when full. DEFINE_Bool(enable_ttl_cache_evict_using_lru, "true"); -// rename ttl filename to new format during read, with some performance cost -DEFINE_mBool(translate_to_new_ttl_format_during_read, "false"); DEFINE_mBool(enbale_dump_error_file, "true"); // limit the max size of error log on disk DEFINE_mInt64(file_cache_error_log_limit_bytes, "209715200"); // 200MB diff --git a/be/src/common/config.h b/be/src/common/config.h index e84cdb5a44f..05fd878808a 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1081,8 +1081,6 @@ DECLARE_mInt64(file_cache_ttl_valid_check_interval_second); // If true, evict the ttl cache using LRU when full. // Otherwise, only expiration can evict ttl and new data won't add to cache when full. DECLARE_Bool(enable_ttl_cache_evict_using_lru); -// rename ttl filename to new format during read, with some performance cost -DECLARE_Bool(translate_to_new_ttl_format_during_read); DECLARE_mBool(enbale_dump_error_file); // limit the max size of error log on disk DECLARE_mInt64(file_cache_error_log_limit_bytes); diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 0a46c98390e..c9a273c5d36 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -292,6 +292,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* file_offset); } if (!st || block_state != FileBlock::State::DOWNLOADED) { + LOG(WARNING) << "Read data failed from file cache downloaded by others. err=" + << st.msg() << ", block state=" << block_state; size_t bytes_read {0}; stats.hit_cache = false; s3_read_counter << 1; diff --git a/be/src/io/cache/fs_file_cache_storage.cpp b/be/src/io/cache/fs_file_cache_storage.cpp index bacd0820c66..8471f6e12b4 100644 --- a/be/src/io/cache/fs_file_cache_storage.cpp +++ b/be/src/io/cache/fs_file_cache_storage.cpp @@ -160,30 +160,36 @@ Status FSFileCacheStorage::read(const FileCacheKey& key, size_t value_offset, Sl get_path_in_local_cache(get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset, key.meta.type); Status s = fs->open_file(file, &file_reader); - if (!s.ok()) { - if (!s.is<ErrorCode::NOT_FOUND>() || key.meta.type != FileCacheType::TTL) { - return s; + + // handle the case that the file is not found but actually exists in other type format + // TODO(zhengyu): nasty! better eliminate the type encoding in file name in the future + if (!s.ok() && !s.is<ErrorCode::NOT_FOUND>()) { + LOG(WARNING) << "open file failed, file=" << file << ", error=" << s.to_string(); + return s; // return other error directly + } else if (!s.ok() && s.is<ErrorCode::NOT_FOUND>()) { // but handle NOT_FOUND error + auto candidates = get_path_in_local_cache_all_candidates( + get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset); + for (auto& candidate : candidates) { + s = fs->open_file(candidate, &file_reader); + if (s.ok()) { + break; // success with one of there candidates + } } - std::string file_old_format = get_path_in_local_cache_old_ttl_format( - get_path_in_local_cache(key.hash, key.meta.expiration_time), key.offset, - key.meta.type); - if (config::translate_to_new_ttl_format_during_read) { - // try to rename the file with old ttl format to new and retry - VLOG(7) << "try to rename the file with old ttl format to new and retry" - << " oldformat=" << file_old_format << " original=" << file; - RETURN_IF_ERROR(fs->rename(file_old_format, file)); - RETURN_IF_ERROR(fs->open_file(file, &file_reader)); - } else { - // try to open the file with old ttl format - VLOG(7) << "try to open the file with old ttl format" - << " oldformat=" << file_old_format << " original=" << file; - RETURN_IF_ERROR(fs->open_file(file_old_format, &file_reader)); + if (!s.ok()) { // still not found, return error + LOG(WARNING) << "open file failed, file=" << file << ", error=" << s.to_string(); + return s; } - } + } // else, s.ok() means open file success + FDCache::instance()->insert_file_reader(fd_key, file_reader); } size_t bytes_read = 0; - RETURN_IF_ERROR(file_reader->read_at(value_offset, buffer, &bytes_read)); + auto s = file_reader->read_at(value_offset, buffer, &bytes_read); + if (!s.ok()) { + LOG(WARNING) << "read file failed, file=" << file_reader->path() + << ", error=" << s.to_string(); + return s; + } DCHECK(bytes_read == buffer.get_size()); return Status::OK(); } @@ -270,6 +276,17 @@ std::string FSFileCacheStorage::get_path_in_local_cache_old_ttl_format(const std return Path(dir) / (std::to_string(offset) + BlockFileCache::cache_type_to_string(type)); } +std::vector<std::string> FSFileCacheStorage::get_path_in_local_cache_all_candidates( + const std::string& dir, size_t offset) { + std::vector<std::string> candidates; + std::string base = get_path_in_local_cache(dir, offset, FileCacheType::NORMAL); + candidates.push_back(base); + candidates.push_back(base + "_idx"); + candidates.push_back(base + "_ttl"); + candidates.push_back(base + "_disposable"); + return candidates; +} + std::string FSFileCacheStorage::get_path_in_local_cache(const UInt128Wrapper& value, uint64_t expiration_time) const { auto str = value.to_string(); diff --git a/be/src/io/cache/fs_file_cache_storage.h b/be/src/io/cache/fs_file_cache_storage.h index 23e98f422ac..ac5d10d0430 100644 --- a/be/src/io/cache/fs_file_cache_storage.h +++ b/be/src/io/cache/fs_file_cache_storage.h @@ -101,6 +101,9 @@ private: void load_cache_info_into_memory(BlockFileCache* _mgr) const; + [[nodiscard]] std::vector<std::string> get_path_in_local_cache_all_candidates( + const std::string& dir, size_t offset); + std::string _cache_base_path; std::thread _cache_background_load_thread; const std::shared_ptr<LocalFileSystem>& fs = global_local_filesystem(); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org