cambyzju commented on code in PR #12897:
URL: https://github.com/apache/doris/pull/12897#discussion_r990634022


##########
be/src/io/cache/file_cache_manager.cpp:
##########
@@ -56,88 +80,82 @@ void FileCacheManager::remove_file_cache(const std::string& 
cache_path) {
     }
 }
 
-void FileCacheManager::clean_timeout_caches() {
-    std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    for (std::map<std::string, FileCachePtr>::const_iterator iter = 
_file_cache_map.cbegin();
-         iter != _file_cache_map.cend(); ++iter) {
-        if (iter->second == nullptr) {
-            continue;
+void 
FileCacheManager::_add_file_cache_for_gc_by_disk(std::vector<GCContextPerDisk>& 
contexts,
+                                                      FileCachePtr file_cache) 
{
+    // sort file cache by last match time
+    if (config::file_cache_max_size_per_disk > 0) {
+        auto file_size = file_cache->cache_file_size();
+        if (file_size <= 0) {
+            return;
+        }
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            if (contexts[i].try_add_file_cache(file_cache, file_size)) {
+                break;
+            }
         }
-        iter->second->clean_timeout_cache();
     }
 }
+void FileCacheManager::gc_file_caches() {
+    int64_t gc_conf_size = config::file_cache_max_size_per_disk;
+    std::vector<GCContextPerDisk> contexts;
+    // init for GC by disk size
+    if (gc_conf_size > 0) {
+        std::vector<DataDir*> data_dirs = 
doris::StorageEngine::instance()->get_stores();
+        contexts.resize(data_dirs.size());
+        for (size_t i = 0; i < contexts.size(); ++i) {
+            contexts[i].init(data_dirs[i]->path(), gc_conf_size);
+        }
+    }
 
-void FileCacheManager::clean_timeout_file_not_in_mem(const std::string& 
cache_path) {
-    time_t now = time(nullptr);
+    // process unused file caches
     std::shared_lock<std::shared_mutex> rdlock(_cache_map_lock);
-    // Deal with caches not in _file_cache_map
-    if (_file_cache_map.find(cache_path) == _file_cache_map.end()) {
-        std::vector<Path> cache_file_names;
-        if (io::global_local_filesystem()->list(cache_path, 
&cache_file_names).ok()) {
-            std::map<std::string, bool> cache_names;
-            std::list<std::string> done_names;
-            for (Path cache_file_name : cache_file_names) {
-                std::string filename = cache_file_name.native();
-                if (!ends_with(filename, CACHE_DONE_FILE_SUFFIX)) {
-                    cache_names[filename] = true;
-                    continue;
-                }
-                done_names.push_back(filename);
-                std::stringstream done_file_ss;
-                done_file_ss << cache_path << "/" << filename;
-                std::string done_file_path = done_file_ss.str();
-                time_t m_time;
-                if (!FileUtils::mtime(done_file_path, &m_time).ok()) {
-                    continue;
-                }
-                if (now - m_time < config::file_cache_alive_time_sec) {
-                    continue;
-                }
-                std::string cache_file_path =
-                        StringReplace(done_file_path, CACHE_DONE_FILE_SUFFIX, 
"", true);
-                LOG(INFO) << "Delete timeout done_cache_path: " << 
done_file_path
-                          << ", cache_file_path: " << cache_file_path << ", 
m_time: " << m_time;
-                if 
(!io::global_local_filesystem()->delete_file(done_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << done_file_path;
+    std::vector<TabletSharedPtr> tablets =
+            StorageEngine::instance()->tablet_manager()->get_all_tablet();
+    for (const auto& tablet : tablets) {
+        std::vector<Path> seg_file_paths;
+        if (io::global_local_filesystem()->list(tablet->tablet_path(), 
&seg_file_paths).ok()) {
+            for (Path seg_file : seg_file_paths) {
+                std::string seg_filename = seg_file.native();
+                // check if it is a dir name
+                if (ends_with(seg_filename, ".dat")) {
                     continue;
                 }
-                if 
(!io::global_local_filesystem()->delete_file(cache_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << cache_file_path;
+                // skip file cache already in memory
+                std::stringstream ss;
+                ss << tablet->tablet_path() << "/" << seg_filename;
+                std::string cache_path = ss.str();
+                if (_file_cache_map.find(cache_path) != _file_cache_map.end()) 
{
                     continue;
                 }
+
+                auto file_cache = std::make_shared<DummyFileCache>(
+                        cache_path, config::file_cache_alive_time_sec);
+                // load cache meta from disk and clean unfinished cache files
+                file_cache->load_and_clean();
+                // policy1: GC file cache by timeout
+                file_cache->clean_timeout_cache();
+                // sort file cache by last match time
+                _add_file_cache_for_gc_by_disk(contexts, file_cache);
             }
-            // find cache file without done file.
-            for (std::list<std::string>::iterator itr = done_names.begin(); 
itr != done_names.end();
-                 ++itr) {
-                std::string cache_filename = StringReplace(*itr, 
CACHE_DONE_FILE_SUFFIX, "", true);
-                if (cache_names.find(cache_filename) != cache_names.end()) {
-                    cache_names.erase(cache_filename);
-                }
-            }
-            // remove cache file without done file
-            for (std::map<std::string, bool>::iterator itr = 
cache_names.begin();
-                 itr != cache_names.end(); ++itr) {
-                std::stringstream cache_file_ss;
-                cache_file_ss << cache_path << "/" << itr->first;
-                std::string cache_file_path = cache_file_ss.str();
-                time_t m_time;
-                if (!FileUtils::mtime(cache_file_path, &m_time).ok()) {
-                    continue;
-                }
-                if (now - m_time < config::file_cache_alive_time_sec) {
-                    continue;
-                }
-                LOG(INFO) << "Delete cache file without done file: " << 
cache_file_path;
-                if 
(!io::global_local_filesystem()->delete_file(cache_file_path).ok()) {
-                    LOG(ERROR) << "delete_file failed: " << cache_file_path;
-                }
-            }
-            if (io::global_local_filesystem()->list(cache_path, 
&cache_file_names).ok() &&
-                cache_file_names.size() == 0) {
-                if 
(global_local_filesystem()->delete_directory(cache_path).ok()) {
-                    LOG(INFO) << "Delete empty dir: " << cache_path;
-                }
-            }
+        }
+    }
+
+    // process file caches in memory

Review Comment:
   Line 114 already acquires the rdlock, and its scope includes these lines.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to