https://github.com/mtrofin created https://github.com/llvm/llvm-project/pull/133147
None

>From 7182baeef88e3d9448062118fd8af808a17fbcd9 Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtro...@google.com>
Date: Mon, 24 Mar 2025 12:01:10 -0700
Subject: [PATCH] RootAutodetect

---
 compiler-rt/lib/ctx_profile/CMakeLists.txt    |   2 +-
 .../lib/ctx_profile/CtxInstrContextNode.h     |   1 +
 .../lib/ctx_profile/CtxInstrProfiling.cpp     | 119 +++++++++++-------
 .../lib/ctx_profile/CtxInstrProfiling.h       |   2 +-
 .../lib/ctx_profile/RootAutoDetector.cpp      |  84 +++++++++++++
 .../lib/ctx_profile/RootAutoDetector.h        |  29 +++++
 .../TestCases/generate-context.cpp            |   4 +-
 .../llvm/ProfileData/CtxInstrContextNode.h    |   1 +
 8 files changed, 195 insertions(+), 47 deletions(-)

diff --git a/compiler-rt/lib/ctx_profile/CMakeLists.txt b/compiler-rt/lib/ctx_profile/CMakeLists.txt
index bb606449c61b1..446ebc96408dd 100644
--- a/compiler-rt/lib/ctx_profile/CMakeLists.txt
+++ b/compiler-rt/lib/ctx_profile/CMakeLists.txt
@@ -27,7 +27,7 @@ endif()
 add_compiler_rt_runtime(clang_rt.ctx_profile
   STATIC
   ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
-  OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
+  OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer
   CFLAGS ${EXTRA_FLAGS}
   SOURCES ${CTX_PROFILE_SOURCES}
   ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
index a42bf9ebb01ea..aa052bc7eea6c 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
@@ -127,6 +127,7 @@ class ContextNode final {
 /// MUTEXDECL takes one parameter, the name of a field that is a mutex.
 #define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL)            \
   PTRDECL(FunctionData, Next)                                                  \
+  PTRDECL(void, EntryAddress)                                                  \
   VOLATILE_PTRDECL(ContextRoot, CtxRoot)                                       \
   VOLATILE_PTRDECL(ContextNode, FlatCtx)                                       \
   MUTEXDECL(Mutex)
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
index da291e0bbabdd..7e73214e639a3 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CtxInstrProfiling.h"
+#include "RootAutoDetector.h"
 #include "sanitizer_common/sanitizer_allocator_internal.h"
 #include "sanitizer_common/sanitizer_atomic.h"
 #include "sanitizer_common/sanitizer_atomic_clang.h"
@@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr;
 __thread bool IsUnderContext = false;
 __sanitizer::atomic_uint8_t ProfilingStarted = {};
 
+__sanitizer::atomic_uintptr_t RootDetector = {};
+RootAutoDetector *getRootDetector() {
+  return reinterpret_cast<RootAutoDetector *>(
+      __sanitizer::atomic_load_relaxed(&RootDetector));
+}
+
 // utility to taint a pointer by setting the LSB. There is an assumption
 // throughout that the addresses of contexts are even (really, they should be
 // align(8), but "even"-ness is the minimum assumption)
@@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
   return Ret;
 }
 
-ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
                             uint32_t NumCounters) {
   if (ContextNode *Existing = Data.FlatCtx)
     return Existing;
@@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
 
   auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
   Data.FlatCtx = Ret;
+  Data.EntryAddress = Callee;
   Data.Next = reinterpret_cast<FunctionData *>(
       __sanitizer::atomic_load_relaxed(&AllFunctionsData));
   while (!__sanitizer::atomic_compare_exchange_strong(
@@ -277,8 +285,29 @@ ContextRoot *FunctionData::getOrAllocateContextRoot() {
   return Root;
 }
 
-ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
-                                 uint32_t NumCounters) {
+ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
+                                      uint32_t Counters, uint32_t Callsites)
+    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = true;
+  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
+                                __sanitizer::memory_order_relaxed);
+
+  if (!Root->FirstMemBlock) {
+    setupContext(Root, Guid, Counters, Callsites);
+  }
+  if (Root->Taken.TryLock()) {
+    __llvm_ctx_profile_current_context_root = Root;
+    onContextEnter(*Root->FirstNode);
+    return Root->FirstNode;
+  }
+  // If this thread couldn't take the lock, return scratch context.
+  __llvm_ctx_profile_current_context_root = nullptr;
+  return TheScratchContext;
+}
+
+ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
+                                 uint32_t NumCounters, uint32_t NumCallsites,
+                                 ContextRoot *CtxRoot) {
 
   // 1) if we are currently collecting a contextual profile, fetch a ContextNode
   // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
@@ -297,27 +326,30 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
   // entered once and never exit. They should be assumed to be entered before
   // profiling starts - because profiling should start after the server is up
   // and running (which is equivalent to "message pumps are set up").
-  ContextRoot *R = __llvm_ctx_profile_current_context_root;
-  if (!R) {
+  if (!CtxRoot) {
+    if (auto *RAD = getRootDetector())
+      RAD->sample();
     if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
-          onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+          onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
   }
-  auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
+  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
   if (Ins)
-    Iter->second =
-        getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
+    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
+                                   NumCounters, 0);
   return markAsScratch(onContextEnter(*Iter->second));
 }
 
 ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
                                             GUID Guid, uint32_t NumCounters,
                                             uint32_t NumCallsites) {
+  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
   // fast "out" if we're not even doing contextual collection.
-  if (!__llvm_ctx_profile_current_context_root)
-    return getUnhandledContext(*Data, Guid, NumCounters);
+  if (!CtxRoot)
+    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+                               nullptr);
 
   // also fast "out" if the caller is scratch. We can see if it's scratch by
   // looking at the interior pointer into the subcontexts vector that the caller
@@ -326,7 +358,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
   // precisely, aligned - 8 values)
   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
   if (!CallsiteContext || isScratch(CallsiteContext))
-    return getUnhandledContext(*Data, Guid, NumCounters);
+    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+                               CtxRoot);
 
   // if the callee isn't the expected one, return scratch.
   // Signal handler(s) could have been invoked at any point in the execution.
@@ -344,7 +377,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
   // for that case.
   auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
   if (ExpectedCallee != Callee)
-    return getUnhandledContext(*Data, Guid, NumCounters);
+    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
+                               CtxRoot);
 
   auto *Callsite = *CallsiteContext;
   // in the case of indirect calls, we will have all seen targets forming a
@@ -366,40 +400,26 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
   return Ret;
 }
 
-ContextNode *__llvm_ctx_profile_start_context(
-    FunctionData *FData, GUID Guid, uint32_t Counters,
-    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
-  IsUnderContext = true;
-
-  auto *Root = FData->getOrAllocateContextRoot();
-
-  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
-                                __sanitizer::memory_order_relaxed);
+ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
+                                              uint32_t Counters,
+                                              uint32_t Callsites) {
 
-  if (!Root->FirstMemBlock) {
-    setupContext(Root, Guid, Counters, Callsites);
-  }
-  if (Root->Taken.TryLock()) {
-    __llvm_ctx_profile_current_context_root = Root;
-    onContextEnter(*Root->FirstNode);
-    return Root->FirstNode;
-  }
-  // If this thread couldn't take the lock, return scratch context.
-  __llvm_ctx_profile_current_context_root = nullptr;
-  return TheScratchContext;
+  return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
+                                  Counters, Callsites);
 }
 
 void __llvm_ctx_profile_release_context(FunctionData *FData)
     SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
+  if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
+    return;
   IsUnderContext = false;
-  if (__llvm_ctx_profile_current_context_root) {
-    __llvm_ctx_profile_current_context_root = nullptr;
-    assert(FData->CtxRoot);
-    FData->CtxRoot->Taken.Unlock();
-  }
+  assert(FData->CtxRoot);
+  __llvm_ctx_profile_current_context_root = nullptr;
+  FData->CtxRoot->Taken.Unlock();
 }
 
-void __llvm_ctx_profile_start_collection() {
+void __llvm_ctx_profile_start_collection(bool AutodetectRoots) {
   size_t NumMemUnits = 0;
   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
       &AllContextsMutex);
@@ -415,12 +435,24 @@ void __llvm_ctx_profile_start_collection() {
       resetContextNode(*Root->FirstUnhandledCalleeNode);
     __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
   }
-  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
-  __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
+  if (AutodetectRoots) {
+    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
+        RootAutoDetector(AllFunctionsData, RootDetector);
+    RD->start();
+  } else {
+    __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
+    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
+  }
 }
 
 bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
   __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+  if (auto *RD = getRootDetector()) {
+    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
+                        "finished well before attempting to fetch a context");
+    RD->join();
+  }
+
   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
       &AllContextsMutex);
 
@@ -445,8 +477,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
   const auto *Pos = reinterpret_cast<const FunctionData *>(
       __sanitizer::atomic_load_relaxed(&AllFunctionsData));
   for (; Pos; Pos = Pos->Next)
-    Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
-                     Pos->FlatCtx->counters_size());
+    if (!Pos->CtxRoot)
+      Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+                       Pos->FlatCtx->counters_size());
   Writer.endFlatSection();
   return true;
 }
diff --git a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
index 6326beaa53085..220a8bd25e6ef 100644
--- a/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
+++ b/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
@@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
 
 /// Prepares for collection. Currently this resets counter values but preserves
 /// internal context tree structure.
-void __llvm_ctx_profile_start_collection();
+void __llvm_ctx_profile_start_collection(bool AutodetectRoots = false);
 
 /// Completely free allocated memory.
 void __llvm_ctx_profile_free();
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
index 7daa8f31e16ea..5888545a79d65 100644
--- a/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
@@ -18,6 +18,90 @@
 
 using namespace __ctx_profile;
 
+namespace __sanitizer {
+void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
+                                    bool request_fast, u32 max_depth) {
+  // We can't implement the fast variant. The fast variant ends up invoking an
+  // external allocator, because of pthread_attr_getstack. If this happens
+  // during an allocation of the program being instrumented, a non-reentrant
+  // lock may be taken (this was observed). The allocator called by
+  // pthread_attr_getstack will also try to take that lock.
+  UnwindSlow(pc, max_depth);
+}
+} // namespace __sanitizer
+
+RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
+  GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
+  Parent.AllSamples.PushBack(this);
+}
+
+void RootAutoDetector::start() {
+  atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
+  pthread_create(
+      &WorkerThread, nullptr,
+      +[](void *Ctx) -> void * {
+        RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
+        SleepForSeconds(30);
+        Vector<PerThreadSamples*> Copy;
+        {
+          GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
+          Copy.Resize(RAD->AllSamples.Size());
+          for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
+            Copy[I] = RAD->AllSamples[I];
+        }
+        DenseMap<uptr, uint64_t> AllRoots;
+        for (uptr I = 0; I < Copy.Size(); ++I) {
+          // Hold the per-thread lock for the duration of the trie traversal
+          // (a named lock object - an unnamed temporary would be destroyed,
+          // and the lock released, on the same statement).
+          GenericScopedLock<SpinMutex> L(&Copy[I]->M);
+          Copy[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
+            auto [FAddr, Count] = KVP;
+            AllRoots[FAddr] += Count;
+            return true;
+          });
+        }
+        for (auto *FD = reinterpret_cast<FunctionData *>(
+                 atomic_load_relaxed(&RAD->FunctionDataListHead));
+             FD; FD = FD->Next) {
+          if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
+            GenericScopedLock<SpinMutex> M(&FD->Mutex);
+            FD->getOrAllocateContextRoot();
+          }
+        }
+        atomic_store_relaxed(&RAD->Self, 0);
+        return nullptr;
+      },
+      this);
+}
+
+void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
+
+void RootAutoDetector::sample() {
+  static thread_local bool Entered = false;
+  static thread_local uint64_t Entries = 0;
+  if (Entered || (++Entries % SampleRate))
+    return;
+  Entered = true;
+  collectStack();
+  Entered = false;
+}
+
+void RootAutoDetector::collectStack() {
+  GET_CALLER_PC_BP;
+  BufferedStackTrace CurrentStack;
+  CurrentStack.Unwind(pc, bp, nullptr, false);
+  if (CurrentStack.size <= 2)
+    return;
+  static thread_local PerThreadSamples *ThisThreadSamples =
+      new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
+          PerThreadSamples(*this);
+
+  if (!ThisThreadSamples->M.TryLock())
+    return;
+
+  ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
+  ThisThreadSamples->M.Unlock();
+}
+
 uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
   // this requires --linkopt=-Wl,--export-dynamic
   Dl_info Info;
diff --git a/compiler-rt/lib/ctx_profile/RootAutoDetector.h b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
index ab51a342d3617..254a40b163632 100644
--- a/compiler-rt/lib/ctx_profile/RootAutoDetector.h
+++ b/compiler-rt/lib/ctx_profile/RootAutoDetector.h
@@ -12,6 +12,7 @@
 #include "sanitizer_common/sanitizer_dense_map.h"
 #include "sanitizer_common/sanitizer_internal_defs.h"
 #include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_vector.h" #include <pthread.h> #include <sanitizer/common_interface_defs.h> @@ -64,5 +65,33 @@ class PerThreadCallsiteTrie { const Trie &start() const { return T; } }; + +class RootAutoDetector final { + static const uint64_t SampleRate = 6113; + pthread_t WorkerThread; + + struct PerThreadSamples { + PerThreadSamples(RootAutoDetector &Parent); + + PerThreadCallsiteTrie TrieRoot; + SpinMutex M; + }; + SpinMutex AllSamplesMutex; + SANITIZER_GUARDED_BY(AllSamplesMutex) + Vector<PerThreadSamples*> AllSamples; + atomic_uintptr_t &FunctionDataListHead; + atomic_uintptr_t &Self; + void collectStack(); + +public: + RootAutoDetector(atomic_uintptr_t &FunctionDataListHead, + atomic_uintptr_t &Self) + : FunctionDataListHead(FunctionDataListHead), Self(Self) {} + + void sample(); + void start(); + void join(); +}; + } // namespace __ctx_profile #endif diff --git a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp index 3dc53637a35d8..7c0d7804ff4a4 100644 --- a/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp +++ b/compiler-rt/test/ctx_profile/TestCases/generate-context.cpp @@ -16,7 +16,7 @@ #include <iostream> using namespace llvm::ctx_profile; -extern "C" void __llvm_ctx_profile_start_collection(); +extern "C" void __llvm_ctx_profile_start_collection(bool); extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &); // avoid name mangling @@ -159,7 +159,7 @@ bool profileWriter() { } int main(int argc, char **argv) { - __llvm_ctx_profile_start_collection(); + __llvm_ctx_profile_start_collection(false); theRoot(); flatFct(); // This would be implemented in a specific RPC handler, but here we just call diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h index a42bf9ebb01ea..aa052bc7eea6c 100644 --- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h +++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h @@ -127,6 +127,7 @@ class ContextNode final { /// MUTEXDECL takes one parameter, the name of a field that is a mutex. #define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \ PTRDECL(FunctionData, Next) \ + PTRDECL(void, EntryAddress) \ VOLATILE_PTRDECL(ContextRoot, CtxRoot) \ VOLATILE_PTRDECL(ContextNode, FlatCtx) \ MUTEXDECL(Mutex) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits