Diffstat (limited to 'contrib/compiler-rt/lib/xray/xray_profile_collector.cc')
-rw-r--r-- | contrib/compiler-rt/lib/xray/xray_profile_collector.cc | 318 |
1 files changed, 318 insertions, 0 deletions
diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
new file mode 100644
index 000000000000..a43744d9a0cb
--- /dev/null
+++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
@@ -0,0 +1,318 @@
+//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_profiling_flags.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+  tid_t TId;
+  FunctionCallTrie *Trie;
+};
+
+struct ProfileBuffer {
+  void *Data;
+  size_t Size;
+};
+
+struct BlockHeader {
+  u32 BlockSize;
+  u32 BlockNum;
+  u64 ThreadId;
+};
+
+// These need to be pointers that point to heap/internal-allocator-allocated
+// objects because these are accessed even at program exit.
+Vector<ThreadTrie> *ThreadTries = nullptr;
+Vector<ProfileBuffer> *ProfileBuffers = nullptr;
+FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+} // namespace
+
+void post(const FunctionCallTrie &T, tid_t TId) {
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  pthread_once(&Once, +[] {
+    SpinMutexLock Lock(&GlobalMutex);
+    GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+        InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+    new (GlobalAllocators) FunctionCallTrie::Allocators();
+    *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+        profilingFlags()->global_allocator_max);
+    ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+        InternalAlloc(sizeof(Vector<ThreadTrie>)));
+    new (ThreadTries) Vector<ThreadTrie>();
+    ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+        InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+    new (ProfileBuffers) Vector<ProfileBuffer>();
+  });
+  DCHECK_NE(GlobalAllocators, nullptr);
+  DCHECK_NE(ThreadTries, nullptr);
+  DCHECK_NE(ProfileBuffers, nullptr);
+
+  ThreadTrie *Item = nullptr;
+  {
+    SpinMutexLock Lock(&GlobalMutex);
+    if (GlobalAllocators == nullptr)
+      return;
+
+    Item = ThreadTries->PushBack();
+    Item->TId = TId;
+
+    // Here we're using the internal allocator instead of the managed allocator
+    // because:
+    //
+    // 1) We're not using the segmented array data structure to host
+    //    FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
+    //    which works like a std::vector<...> keeping elements contiguous in
+    //    memory. The segmented array data structure assumes that elements are
+    //    trivially destructible, where FunctionCallTrie isn't.
+    //
+    // 2) Using a managed allocator means we need to manage that separately,
+    //    which complicates the nature of this code. To get around that, we're
+    //    using the internal allocator instead, which has its own global state
+    //    and is decoupled from the lifetime management required by the managed
+    //    allocator we have in XRay.
+    //
+    Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+        sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
+    DCHECK_NE(Item->Trie, nullptr);
+    new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+  }
+
+  T.deepCopyInto(*Item->Trie);
+}
+
+// A PathArray represents the function id's representing a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+  using PathAllocator = typename PathArray::AllocatorType;
+
+  // The Path in this record is the function id's from the leaf to the root of
+  // the function call stack as represented from a FunctionCallTrie.
+  PathArray *Path = nullptr;
+  const FunctionCallTrie::Node *Node = nullptr;
+
+  // Constructor for in-place construction.
+  ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+      : Path([&] {
+          auto P =
+              reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
+          new (P) PathArray(A);
+          return P;
+        }()),
+        Node(N) {}
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Walk a depth-first traversal of each root of the FunctionCallTrie to generate
+// the path(s) and the data associated with the path.
+static void populateRecords(ProfileRecordArray &PRs,
+                            ProfileRecord::PathAllocator &PA,
+                            const FunctionCallTrie &Trie) {
+  using StackArray = Array<const FunctionCallTrie::Node *>;
+  using StackAllocator = typename StackArray::AllocatorType;
+  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+  StackArray DFSStack(StackAlloc);
+  for (const auto R : Trie.getRoots()) {
+    DFSStack.Append(R);
+    while (!DFSStack.empty()) {
+      auto Node = DFSStack.back();
+      DFSStack.trim(1);
+      auto Record = PRs.AppendEmplace(PA, Node);
+      if (Record == nullptr)
+        return;
+      DCHECK_NE(Record, nullptr);
+
+      // Traverse the Node's parents and as we're doing so, get the FIds in
+      // the order they appear.
+      for (auto N = Node; N != nullptr; N = N->Parent)
+        Record->Path->Append(N->FId);
+      DCHECK(!Record->Path->empty());
+
+      for (const auto C : Node->Callees)
+        DFSStack.Append(C.NodePtr);
+    }
+  }
+}
+
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+                             const ProfileRecordArray &ProfileRecords) {
+  auto NextPtr = static_cast<char *>(
+                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+                 sizeof(Header);
+  for (const auto &Record : ProfileRecords) {
+    // List of IDs follow:
+    for (const auto FId : *Record.Path)
+      NextPtr =
+          static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+          sizeof(FId);
+
+    // Add the sentinel here.
+    constexpr int32_t SentinelFId = 0;
+    NextPtr = static_cast<char *>(
+                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+              sizeof(SentinelFId);
+
+    // Add the node data here.
+    NextPtr =
+        static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
+                                            sizeof(Record.Node->CallCount))) +
+        sizeof(Record.Node->CallCount);
+    NextPtr = static_cast<char *>(
+                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+                                  sizeof(Record.Node->CumulativeLocalTime))) +
+              sizeof(Record.Node->CumulativeLocalTime);
+  }
+
+  DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+void serialize() {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Clear out the global ProfileBuffers.
+  for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+    InternalFree((*ProfileBuffers)[I].Data);
+  ProfileBuffers->Reset();
+
+  if (ThreadTries->Size() == 0)
+    return;
+
+  // Then repopulate the global ProfileBuffers.
+  for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+    ProfileRecord::PathAllocator PathAlloc(
+        profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need. We'll
+    // use a local allocator and an __xray::Array<...> to store the intermediary
+    // data, then compute the size as we're going along. Then we'll allocate the
+    // contiguous space to contain the thread buffer data.
+    const auto &Trie = *(*ThreadTries)[I].Trie;
+    if (Trie.getRoots().empty())
+      continue;
+    populateRecords(ProfileRecords, PathAlloc, Trie);
+    DCHECK(!Trie.getRoots().empty());
+    DCHECK(!ProfileRecords.empty());
+
+    // Go through each record, to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //   + block number (4 bytes)
+    //   + thread id (8 bytes)
+    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+    //   + call count (8 bytes)
+    //   + local time (8 bytes)
+    //   + end of record (8 bytes)
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 20 + (4 * Record.Path->size());
+
+    BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
+    auto Buffer = ProfileBuffers->PushBack();
+    Buffer->Size = sizeof(Header) + CumulativeSizes;
+    Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+    DCHECK_NE(Buffer->Data, nullptr);
+    serializeRecords(Buffer, Header, ProfileRecords);
+
+    // Now clean up the ProfileRecords array, one at a time.
+    for (auto &Record : ProfileRecords) {
+      Record.Path->~PathArray();
+      InternalFree(Record.Path);
+    }
+  }
+}
+
+void reset() {
+  SpinMutexLock Lock(&GlobalMutex);
+  if (ProfileBuffers != nullptr) {
+    // Clear out the profile buffers that have been serialized.
+    for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+      InternalFree((*ProfileBuffers)[I].Data);
+    ProfileBuffers->Reset();
+    InternalFree(ProfileBuffers);
+    ProfileBuffers = nullptr;
+  }
+
+  if (ThreadTries != nullptr) {
+    // Clear out the function call tries per thread.
+    for (uptr I = 0; I < ThreadTries->Size(); ++I) {
+      auto &T = (*ThreadTries)[I];
+      T.Trie->~FunctionCallTrie();
+      InternalFree(T.Trie);
+    }
+    ThreadTries->Reset();
+    InternalFree(ThreadTries);
+    ThreadTries = nullptr;
+  }
+
+  // Reset the global allocators.
+  if (GlobalAllocators != nullptr) {
+    GlobalAllocators->~Allocators();
+    InternalFree(GlobalAllocators);
+    GlobalAllocators = nullptr;
+  }
+  GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+      InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+  new (GlobalAllocators) FunctionCallTrie::Allocators();
+  *GlobalAllocators = FunctionCallTrie::InitAllocators();
+  ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+      InternalAlloc(sizeof(Vector<ThreadTrie>)));
+  new (ThreadTries) Vector<ThreadTrie>();
+  ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+      InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+  new (ProfileBuffers) Vector<ProfileBuffer>();
+}
+
+XRayBuffer nextBuffer(XRayBuffer B) {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+    return {nullptr, 0};
+
+  if (B.Data == nullptr)
+    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
+
+  BlockHeader Header;
+  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+  auto NextBlock = Header.BlockNum + 1;
+  if (NextBlock < ProfileBuffers->Size())
+    return {(*ProfileBuffers)[NextBlock].Data,
+            (*ProfileBuffers)[NextBlock].Size};
+  return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray
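
Note on the serialized layout: serializeRecords() above writes, per thread, a 16-byte BlockHeader (u32 block size, u32 block number, u64 thread id) followed by one record per call path, where each record is the int32 function ids from leaf to root, a 4-byte zero sentinel, then the node's call count and cumulative local time as two u64 values. The following reader-side sketch of that format is not part of this commit; names such as ParsedRecord and parseBlock are illustrative only and assume the layout described above.

#include <cstdint>
#include <cstring>
#include <vector>

// Reader-side mirror of one serialized record (illustrative, not part of the
// XRay runtime).
struct ParsedRecord {
  std::vector<int32_t> Path; // function ids, leaf to root
  uint64_t CallCount = 0;
  uint64_t CumulativeLocalTime = 0;
};

// Walks a single block as written by serializeRecords(): skip the 16-byte
// header, then read records until the end of the buffer.
static std::vector<ParsedRecord> parseBlock(const char *Data, size_t Size) {
  std::vector<ParsedRecord> Records;
  const char *P = Data + 16; // past BlockHeader (u32 + u32 + u64)
  const char *End = Data + Size;
  while (P < End) {
    ParsedRecord R;
    for (;;) { // path ids terminated by a 0 sentinel
      int32_t FId;
      std::memcpy(&FId, P, sizeof(FId));
      P += sizeof(FId);
      if (FId == 0)
        break;
      R.Path.push_back(FId);
    }
    std::memcpy(&R.CallCount, P, sizeof(uint64_t));
    P += sizeof(uint64_t);
    std::memcpy(&R.CumulativeLocalTime, P, sizeof(uint64_t));
    P += sizeof(uint64_t);
    Records.push_back(R);
  }
  return Records;
}

The blocks themselves would be drained by calling nextBuffer() repeatedly after serialize() has run, starting from an empty XRayBuffer and feeding each returned buffer back in, for example:

  for (auto B = profileCollectorService::nextBuffer({nullptr, 0}); B.Data != nullptr;
       B = profileCollectorService::nextBuffer(B)) {
    // Each B.Data / B.Size covers one thread's block (header + records).
    auto Records = parseBlock(static_cast<const char *>(B.Data), B.Size);
  }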