sammccall updated this revision to Diff 167435.
sammccall added a comment.
Herald added a subscriber: mgrang.
Tests, tweaks to fix edge cases.
Repository:
rC Clang
https://reviews.llvm.org/D52549
Files:
include/clang/Basic/VirtualFileSystem.h
lib/Basic/AvoidStatsVFS.cpp
lib/Basic/CMakeLists.txt
lib/Basic/VirtualFileSystem.cpp
unittests/Basic/VirtualFileSystemTest.cpp
Index: unittests/Basic/VirtualFileSystemTest.cpp
===================================================================
--- unittests/Basic/VirtualFileSystemTest.cpp
+++ unittests/Basic/VirtualFileSystemTest.cpp
@@ -1616,3 +1616,139 @@
EXPECT_EQ(3, NumDiagnostics);
}
+
+class AvoidStatsVFSTest : public ::testing::Test {
+ protected:
+ struct Counts {
+ unsigned Status = 0;
+ unsigned OpenFile = 0;
+ unsigned ListDir = 0;
+ };
+ Counts Outer, Inner;
+ IntrusiveRefCntPtr<vfs::InMemoryFileSystem> MemFS;
+
+ IntrusiveRefCntPtr<vfs::FileSystem> reset() {
+ Outer = {};
+ Inner = {};
+ MemFS = new vfs::InMemoryFileSystem();
+ return new CountingFS(
+ Outer, vfs::avoidStats(new CountingFS(Inner, MemFS)).release());
+ }
+
+ void touch(const Twine &Path) {
+ MemFS->addFile(Path, 0, MemoryBuffer::getMemBuffer(""));
+ }
+
+ private:
+ class CountingFS : public vfs::ProxyFileSystem {
+ Counts &C;
+
+ public:
+ CountingFS(Counts &C, IntrusiveRefCntPtr<vfs::FileSystem> Base)
+ : ProxyFileSystem(Base), C(C) {}
+
+ llvm::ErrorOr<std::unique_ptr<vfs::File>>
+ openFileForRead(const Twine &Path) override {
+ ++C.OpenFile;
+ return ProxyFileSystem::openFileForRead(Path);
+ }
+
+ vfs::directory_iterator dir_begin(const Twine &Dir,
+ std::error_code &EC) override {
+ ++C.ListDir;
+ return ProxyFileSystem::dir_begin(Dir, EC);
+ }
+
+ llvm::ErrorOr<vfs::Status> status(const Twine &Path) override {
+ ++C.Status;
+ return ProxyFileSystem::status(Path);
+ }
+ };
+};
+
+TEST_F(AvoidStatsVFSTest, MissingFileNoStat) {
+ auto FS = reset();
+ touch("/foo/a");
+ touch("/foo/c");
+ touch("/foo/f");
+
+ EXPECT_FALSE(FS->status("/foo/a").getError()); // Just stat, /foo wanted once.
+ EXPECT_TRUE(FS->status("/foo/b").getError()); // List /foo, b known missing
+ EXPECT_FALSE(FS->status("/foo/c").getError()); // c exists, must stat
+ EXPECT_TRUE(FS->status("/foo/d").getError()); // d known missing
+ EXPECT_TRUE(FS->openFileForRead("/foo/e").getError()); // e known missing
+ EXPECT_FALSE(FS->openFileForRead("/foo/f").getError()); // f exists, must open
+
+ EXPECT_EQ(Outer.Status, 4u);
+ EXPECT_EQ(Outer.OpenFile, 2u);
+ EXPECT_EQ(Outer.ListDir, 0u);
+ EXPECT_EQ(Inner.Status, 2u);
+ EXPECT_EQ(Inner.OpenFile, 1u);
+ EXPECT_EQ(Inner.ListDir, 1u);
+}
+
+TEST_F(AvoidStatsVFSTest, ParentDirsMissing) {
+ auto FS = reset();
+ touch("/a/b/z");
+
+ EXPECT_TRUE(FS->status("/a/b/1").getError()); // Just stat.
+ EXPECT_TRUE(FS->status("/a/b/2").getError()); // List /a/b
+ EXPECT_TRUE(FS->status("/a/b/3").getError()); // Known missing
+ EXPECT_TRUE(FS->status("/a/b/c/d").getError()); // Known missing
+ EXPECT_TRUE(FS->status("/a/b/z/d").getError()); // Parent is a file
+ EXPECT_EQ(Outer.Status, 5u);
+ EXPECT_EQ(Outer.ListDir, 0u);
+ EXPECT_EQ(Inner.Status, 1u);
+ EXPECT_EQ(Inner.ListDir, 1u);
+}
+
+TEST_F(AvoidStatsVFSTest, HugeDir) {
+ auto FS = reset();
+ for (int I = 0; I < 10000; ++I)
+ touch("/big/" + Twine(I));
+
+ EXPECT_FALSE(FS->status("/big/1").getError()); // Just stat.
+ EXPECT_FALSE(FS->status("/big/9998").getError()); // Try to list, fail, stat.
+ EXPECT_FALSE(FS->status("/big/9999").getError()); // stat, don't list
+ EXPECT_TRUE(FS->status("/big/10001").getError());
+ EXPECT_TRUE(FS->status("/big/x/10001").getError());
+ EXPECT_TRUE(FS->status("/big/1/10001").getError()); // missing parent in cache
+
+ EXPECT_EQ(Outer.Status, 6u);
+ EXPECT_EQ(Outer.ListDir, 0u);
+ EXPECT_EQ(Inner.Status, 5u);
+ EXPECT_EQ(Inner.ListDir, 1u);
+}
+
+TEST_F(AvoidStatsVFSTest, Duplicate) {
+ // Stat + stat on a missing file is one operation.
+ auto FS = reset();
+ EXPECT_TRUE(FS->status("/x").getError()); // Just stat.
+ EXPECT_TRUE(FS->status("/x").getError()); // No need to stat or list again.
+ EXPECT_EQ(Inner.Status, 1u);
+ EXPECT_EQ(Inner.ListDir, 0u);
+
+ // Same with open + open.
+ FS = reset();
+ EXPECT_TRUE(FS->openFileForRead("/x").getError());
+ EXPECT_TRUE(FS->openFileForRead("/x").getError());
+ EXPECT_EQ(Inner.OpenFile, 1u);
+ EXPECT_EQ(Inner.ListDir, 0u);
+
+ // And stat + open.
+ FS = reset();
+ EXPECT_TRUE(FS->status("/x").getError());
+ EXPECT_TRUE(FS->openFileForRead("/x").getError());
+ EXPECT_EQ(Inner.Status, 1u);
+ EXPECT_EQ(Inner.OpenFile, 0u);
+ EXPECT_EQ(Inner.ListDir, 0u);
+
+ // But open + stat is two: as far as open() knows, it might be a dir.
+ // We list the directory (second attempt) and see that it's not.
+ FS = reset();
+ EXPECT_TRUE(FS->openFileForRead("/x").getError());
+ EXPECT_TRUE(FS->status("/x").getError());
+ EXPECT_EQ(Inner.OpenFile, 1u);
+ EXPECT_EQ(Inner.Status, 0u);
+ EXPECT_EQ(Inner.ListDir, 1u);
+}
Index: lib/Basic/VirtualFileSystem.cpp
===================================================================
--- lib/Basic/VirtualFileSystem.cpp
+++ lib/Basic/VirtualFileSystem.cpp
@@ -303,11 +303,6 @@
return llvm::sys::fs::real_path(Path, Output);
}
-IntrusiveRefCntPtr<FileSystem> vfs::getRealFileSystem() {
- static IntrusiveRefCntPtr<FileSystem> FS = new RealFileSystem();
- return FS;
-}
-
namespace {
class RealFSDirIter : public clang::vfs::detail::DirIterImpl {
@@ -2141,3 +2136,9 @@
return *this;
}
+
+IntrusiveRefCntPtr<FileSystem> vfs::getRealFileSystem() {
+ static IntrusiveRefCntPtr<FileSystem> FS(
+ avoidStats(new RealFileSystem()).release());
+ return FS;
+}
Index: lib/Basic/CMakeLists.txt
===================================================================
--- lib/Basic/CMakeLists.txt
+++ lib/Basic/CMakeLists.txt
@@ -46,6 +46,7 @@
add_clang_library(clangBasic
Attributes.cpp
+ AvoidStatsVFS.cpp
Builtins.cpp
CharInfo.cpp
Cuda.cpp
Index: lib/Basic/AvoidStatsVFS.cpp
===================================================================
--- /dev/null
+++ lib/Basic/AvoidStatsVFS.cpp
@@ -0,0 +1,368 @@
+//===- AvoidStatsVFS.cpp - Implements a stat-reducing VFS wrapper ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// We implement a VFS wrapper that uses directory listings to prune status() and
+// open() operations for files that *do not exist*.
+//
+// These operations are common in clang because of include paths.
+// With an include path of {A, B, ...}, an #include <foo/bar> directive results
+// in attempts to open or stat {A/foo/bar, B/foo/bar, ...}.
+//
+// This is expensive because this typically takes one syscall per path, so we
+// have O(NumIncludes * IncludePathLen) syscalls.
+//
+// To optimize this, we can list parent directories such as A/.
+// If A only contains {config.h}, attempts to open A/foo/bar, A/foo/baz, A/qux
+// can all fail instantly.
+// Listing a directory tends to be a single syscall, and in practice most
+// missing files can be recognized by looking at the same few directories.
+// In practice the number of syscalls is O(NumIncludes + IncludePathLen).
+//
+// Our constant factor is higher, but this improves performance for large TUs
+// with many include paths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/VirtualFileSystem.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Path.h"
+#include <mutex>
+#define DEBUG_TYPE "AvoidStatsVFS"
+
+namespace clang {
+namespace vfs {
+namespace {
+using namespace llvm;
+namespace path = llvm::sys::path;
+using llvm::sys::fs::file_type;
+
+class AvoidStatVFS : public ProxyFileSystem {
+public:
+ // In fact we only read a directory once we've wanted its contents several
+ // times. This avoids a common pattern:
+ // - we read files under /some/long/path/...
+ // - we don't read anything else under /some/long/...
+ // - we readdir(/some/long), which tells us that /some/long/path is a dir
+ // - we still need to readdir(/some/long/path) to get its contents
+ // - that also tells us it's a directory, so readdir(/some/long) was useless
+ // We refuse to do IO to understand each dir until we reach this threshold.
+ constexpr static unsigned ReadDirThreshold = 2;
+
+ AvoidStatVFS(IntrusiveRefCntPtr<FileSystem> Base)
+ : ProxyFileSystem(std::move(Base)) {}
+
+ llvm::ErrorOr<std::unique_ptr<vfs::File>>
+ openFileForRead(const Twine &Path) override {
+ LLVM_DEBUG(dump("before open " + Path));
+ auto G = make_scope_exit([&] { LLVM_DEBUG(dump("after open " + Path)); });
+
+ auto NormPath = normalizePath(Path);
+ {
+ std::lock_guard<std::mutex> Lock(CacheMu);
+ if (!mayBeFile</*OrDir=*/false>(NormPath))
+ return std::make_error_code(std::errc::no_such_file_or_directory);
+ }
+ LLVM_DEBUG(llvm::dbgs() << "passthru open " << Path << "\n");
+ auto Result = ProxyFileSystem::openFileForRead(Path);
+
+ std::lock_guard<std::mutex> Lock(CacheMu);
+ recordState(NormPath, Result ? File : MissingOrDir);
+ return Result;
+ }
+
+ llvm::ErrorOr<Status> status(const Twine &Path) override {
+ LLVM_DEBUG(dump("before status " + Path));
+ auto G = make_scope_exit([&] { LLVM_DEBUG(dump("after status " + Path)); });
+
+ auto NormPath = normalizePath(Path);
+ {
+ std::lock_guard<std::mutex> Lock(CacheMu);
+ if (!mayBeFile</*OrDir=*/true>(NormPath))
+ return std::make_error_code(std::errc::no_such_file_or_directory);
+ }
+ LLVM_DEBUG(llvm::dbgs() << "passthru status " << Path << "\n");
+ auto Result = ProxyFileSystem::status(Path);
+
+ std::lock_guard<std::mutex> Lock(CacheMu);
+ recordState(NormPath, Result
+ ? (Result->getType() == file_type::directory_file
+ ? DirUnknownChildren
+ : File)
+ : Missing);
+ return Result;
+ }
+
+private:
+ // What we know about a path, which may be partial information.
+ // Values are ordered: higher values have more info and "upgrade" lower ones.
+ enum PathState {
+ // Ambiguous states.
+ Unknown,
+ MissingOrDir,
+ MissingOrFile,
+ // Type known, but not all relevant details.
+ DirUnknownChildren,
+ DirUnenumerable, // unknown children, but don't try to enumerate again!
+ // Complete information.
+ DirKnownChildren,
+ File,
+ Missing,
+ };
+
+ // The cache always uses absolute paths with ./ and ../ segments removed.
+ SmallString<256> normalizePath(const Twine &Path) {
+ SmallString<256> Result;
+ Path.toVector(Result);
+ makeAbsolute(Result);
+ path::remove_dots(Result, /*remove_dot_dot=*/true);
+ return Result;
+ }
+
+ // Helpers used in LLVM_DEBUG() logging.
+#ifndef NDEBUG
+ void dump(const Twine &When) {
+ llvm::dbgs() << "AvoidStatsVFS " << When << ":\n";
+ std::vector<std::pair<StringRef, PathState>> V;
+ for (const auto& E : Cache)
+ V.emplace_back(E.getKey(), E.getValue());
+ llvm::sort(V.begin(), V.end());
+ for (const auto &E : V)
+ llvm::dbgs() << " - " << E.first << " = " << str(E.second) << "\n";
+ }
+
+ const char* str(PathState S) {
+ switch (S) {
+ case Unknown: return "Unknown";
+ case MissingOrDir: return "MissingOrDir";
+ case MissingOrFile: return "MissingOrFile";
+ case DirUnknownChildren: return "DirUnknownChildren";
+ case DirUnenumerable: return "DirUnenumerable";
+ case DirKnownChildren: return "DirKnownChildren";
+ case File: return "File";
+ case Missing: return "Missing";
+ }
+ }
+#endif
+
+ // Returns false if we know NormPath points to a file that, based on cached
+ // information about the path and its parents.
+ // May retrieve extra information about parent directories to better answer.
+ // If the function returns true, NormPath may or may not be a file.
+ // If OrDir is true, both files and directories are acceptable "files".
+ // REQUIRES: CacheMu is held.
+ template <bool OrDir>
+ bool mayBeFile(StringRef NormPath) {
+ // First, just see if we can answer from the cache.
+ switch (Cache.lookup(NormPath)) {
+ case Unknown:
+ case MissingOrFile:
+ break;
+ case MissingOrDir:
+ if (!OrDir)
+ return false;
+ break;
+ case DirUnknownChildren:
+ case DirUnenumerable:
+ case DirKnownChildren:
+ return OrDir;
+ case File:
+ return true;
+ case Missing:
+ return false;
+ }
+
+ // Next, maybe we can get an answer based on the parent directory.
+ auto Parent = path::parent_path(NormPath);
+ if (Parent.empty())
+ return OrDir; // Root is a directory.
+ // We may need to populate its cache entry.
+ if (!populateCacheForDir(Parent))
+ return true; // No more information.
+
+ switch (Cache.lookup(Parent)) {
+ case Unknown:
+ case MissingOrDir:
+ case DirUnknownChildren:
+ llvm_unreachable("populateCacheForDir didn't provide enough info");
+ case MissingOrFile:
+ case File:
+ case Missing:
+ return false;
+ case DirUnenumerable:
+ return true;
+ case DirKnownChildren:
+ // The point: we listed the parent, all children are now in cache.
+ switch (Cache.lookup(NormPath)) {
+ case MissingOrDir:
+ case MissingOrFile:
+ llvm_unreachable("populateCacheForDir didn't provide child info");
+ case Unknown:
+ case Missing:
+ return false;
+ case DirUnknownChildren:
+ case DirUnenumerable:
+ case DirKnownChildren:
+ return OrDir;
+ case File:
+ return true;
+ };
+ }
+ }
+
+ // Record new information about a path's state.
+ // Combines appropriately with existing cached information (won't overwrite
+ // a more specific state with less specific state).
+ // REQUIRES: CacheMu is held.
+ void recordState(StringRef NormPath, PathState State) {
+ auto& Current = Cache[NormPath];
+ // Sherlock Holmes would be proud of this special case.
+ if ((Current == MissingOrDir && State == MissingOrFile) ||
+ (Current == MissingOrFile && State == MissingOrDir)) {
+ Current = Missing;
+ return;
+ }
+ Current = std::max(Current, State);
+ switch (State) {
+ case Unknown:
+ case MissingOrDir:
+ case MissingOrFile:
+ case Missing:
+ break;
+ case DirUnenumerable:
+ case DirUnknownChildren:
+ case DirKnownChildren:
+ case File:
+ auto Parent = path::parent_path(NormPath);
+ if (!Parent.empty())
+ recordState(Parent, DirUnknownChildren);
+ break;
+ }
+ }
+
+ // Attempts to populate the cache with information about a directory.
+ // Roughly: if the directory children are not known, readdir() to fill it.
+ //
+ // Details are messy:
+ // - If we know the directory doesn't exist, don't read from it.
+ // - We may need to populate parent caches in order to know that.
+ // - We may decline to populate the cache because the directory wasn't
+ // accessed enough yet.
+ //
+ // After returning, exactly one of the following is true:
+ // - we returned false, and declined to populate the cache
+ // - Cache[NormPath] indicates NormPath is not a directory
+ // - Cache[NormPath] == DirKnownChildren, all children are in cache
+ // - Cache[NormPath] == DirUnenumerable
+ //
+ // If AllowIO is false, we *only* attempt to propagate information already in
+ // cache. Only AllowIO=true calls count towards the ReadDirThreshold.
+ //
+ // REQUIRES: CacheMu is held.
+ bool populateCacheForDir(StringRef NormPath, bool AllowIO = true) {
+ // First, just see if we have any work to do.
+ bool ParentMightHelp = true;
+ switch (Cache.lookup(NormPath)) {
+ case Unknown:
+ case MissingOrDir:
+ break; // Need to populate cache with more info.
+ case DirUnknownChildren:
+ ParentMightHelp = false; // Listing parent won't detail our children.
+ break;
+ case MissingOrFile:
+ case DirUnenumerable:
+ case DirKnownChildren:
+ case File:
+ case Missing:
+ return true; // Cache already satisfies postcondition.
+ }
+ if (AllowIO && ++ReadDirAttempts[NormPath] < ReadDirThreshold)
+ AllowIO = false;
+ // Next, populate parent info and see if that determines our state.
+ auto Parent = path::parent_path(NormPath);
+ if (ParentMightHelp && !Parent.empty() &&
+ populateCacheForDir(Parent, AllowIO)) {
+ switch (Cache.lookup(Parent)) {
+ case Unknown:
+ case MissingOrDir:
+ case DirUnknownChildren:
+ llvm_unreachable("populateCacheForDir didn't provide enough info");
+ case MissingOrFile:
+ case File:
+ case Missing:
+ // Child can't exist if parent is not a directory.
+ recordState(NormPath, Missing);
+ return true;
+ case DirUnenumerable:
+ break; // No info about child, need to read it.
+ case DirKnownChildren:
+ // Parent is a directory, and we can tell whether child is.
+ switch (Cache.lookup(NormPath)) {
+ case MissingOrDir:
+ case MissingOrFile:
+ llvm_unreachable("populateCacheForDir didn't provide child info");
+ case Unknown:
+ recordState(NormPath, Missing);
+ return true;
+ case DirUnknownChildren:
+ break; // Need to list children.
+ case DirUnenumerable:
+ case DirKnownChildren:
+ // populateCacheForDir shouldn't do this, but we might be racing.
+ return true;
+ case File:
+ case Missing:
+ return true; // Cache now satisfies postcondition.
+ }
+ break;
+ }
+ }
+
+ // No other sources of info, we have to list the directory.
+ if (!AllowIO)
+ return false;
+ // Unlock around IO - no accessing the cache!
+ CacheMu.unlock();
+ LLVM_DEBUG(llvm::dbgs() << "synthetic readdir " << NormPath << "\n");
+ std::vector<std::pair<std::string, file_type>> DirListing;
+ std::error_code EC;
+ for (auto It = dir_begin(NormPath, EC);
+ !EC && It != directory_iterator();) {
+ DirListing.emplace_back(It->path(), It->type());
+ It.increment(EC);
+ if (DirListing.size() == 100)
+ EC = std::make_error_code(std::errc::result_out_of_range);
+ }
+ CacheMu.lock();
+
+ // Record the results of the directory listing.
+ for (const auto& Entry : DirListing)
+ recordState(Entry.first, Entry.second == file_type::directory_file
+ ? DirUnknownChildren
+ : File);
+ recordState(NormPath,
+ EC ? DirListing.empty() ? MissingOrFile : DirUnenumerable
+ : DirKnownChildren);
+ return true;
+ }
+
+ std::mutex CacheMu;
+ StringMap<PathState> Cache;
+ StringMap<unsigned> ReadDirAttempts;
+};
+} // namespace
+
+std::unique_ptr<FileSystem>
+avoidStats(llvm::IntrusiveRefCntPtr<FileSystem> FS) {
+ return llvm::make_unique<AvoidStatVFS>(std::move(FS));
+}
+
+} // namespace vfs
+} // namespace clang
Index: include/clang/Basic/VirtualFileSystem.h
===================================================================
--- include/clang/Basic/VirtualFileSystem.h
+++ include/clang/Basic/VirtualFileSystem.h
@@ -461,6 +461,20 @@
std::error_code setCurrentWorkingDirectory(const Twine &Path) override;
};
+/// Wrap a filesystem to avoid calling status() and open() on missing files.
+///
+/// When files are accessed, the FS lists ancestor directories and caches their
+/// contents. This allows subsequent status() and open() calls for sibling
+/// nonexistent paths to fail without making any syscalls.
+///
+/// This is likely to improve performance when a large fraction of open()ed or
+/// status()ed files do not exist, and their paths are related.
+/// Notably: resolving #included files with a long include path.
+///
+/// This wrapper never invalidates cached information, so is only suitable when
+/// the underlying FS is assumed to be static.
+std::unique_ptr<FileSystem> avoidStats(llvm::IntrusiveRefCntPtr<FileSystem>);
+
/// Get a globally unique ID for a virtual file or directory.
llvm::sys::fs::UniqueID getNextVirtualUniqueID();
_______________________________________________
cfe-commits mailing list
[email protected]
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits