Diffstat (limited to 'contrib/compiler-rt/lib/xray/xray_profile_collector.cc')
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_profile_collector.cc | 318
1 file changed, 318 insertions(+), 0 deletions(-)
diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
new file mode 100644
index 000000000000..a43744d9a0cb
--- /dev/null
+++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
@@ -0,0 +1,318 @@
+//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_profiling_flags.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+ tid_t TId;
+ FunctionCallTrie *Trie;
+};
+
+struct ProfileBuffer {
+ void *Data;
+ size_t Size;
+};
+
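+// Each serialized profile block begins with this fixed 16-byte header: the
+// total size of the block in bytes, the block's index in the stream, and the
+// id of the thread whose trie the block was generated from.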
+struct BlockHeader {
+ u32 BlockSize;
+ u32 BlockNum;
+ u64 ThreadId;
+};
+
+// These need to be pointers that point to heap/internal-allocator-allocated
+// objects because these are accessed even at program exit.
+Vector<ThreadTrie> *ThreadTries = nullptr;
+Vector<ProfileBuffer> *ProfileBuffers = nullptr;
+FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+
+} // namespace
+
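+// Copies the given thread's FunctionCallTrie into collector-owned, internally
+// allocated storage, keyed by the thread id. The global state (allocators and
+// vectors) is lazily initialised on the first call.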
+void post(const FunctionCallTrie &T, tid_t TId) {
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ pthread_once(&Once, +[] {
+ SpinMutexLock Lock(&GlobalMutex);
+ GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+ InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+ new (GlobalAllocators) FunctionCallTrie::Allocators();
+ *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
+ profilingFlags()->global_allocator_max);
+ ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+ InternalAlloc(sizeof(Vector<ThreadTrie>)));
+ new (ThreadTries) Vector<ThreadTrie>();
+ ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+ InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+ new (ProfileBuffers) Vector<ProfileBuffer>();
+ });
+ DCHECK_NE(GlobalAllocators, nullptr);
+ DCHECK_NE(ThreadTries, nullptr);
+ DCHECK_NE(ProfileBuffers, nullptr);
+
+ ThreadTrie *Item = nullptr;
+ {
+ SpinMutexLock Lock(&GlobalMutex);
+ if (GlobalAllocators == nullptr)
+ return;
+
+ Item = ThreadTries->PushBack();
+ Item->TId = TId;
+
+ // Here we're using the internal allocator instead of the managed allocator
+ // because:
+ //
+ // 1) We're not using the segmented array data structure to host
+ // FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
+ // which works like a std::vector<...> keeping elements contiguous in
+ // memory. The segmented array data structure assumes that elements are
+ // trivially destructible, whereas FunctionCallTrie isn't.
+ //
+ // 2) Using a managed allocator means we need to manage that separately,
+ // which complicates the nature of this code. To get around that, we're
+ // using the internal allocator instead, which has its own global state
+ // and is decoupled from the lifetime management required by the managed
+ // allocator we have in XRay.
+ //
+ Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
+ sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
+ DCHECK_NE(Item->Trie, nullptr);
+ new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+ }
+
+ T.deepCopyInto(*Item->Trie);
+}
+
+// A PathArray holds the function ids that make up a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+ using PathAllocator = typename PathArray::AllocatorType;
+
+ // The Path in this record is the list of function ids from the leaf to the
+ // root of the function call stack, as represented in a FunctionCallTrie.
+ PathArray *Path = nullptr;
+ const FunctionCallTrie::Node *Node = nullptr;
+
+ // Constructor for in-place construction.
+ ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
+ : Path([&] {
+ auto P =
+ reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
+ new (P) PathArray(A);
+ return P;
+ }()),
+ Node(N) {}
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Performs a depth-first traversal from each root of the FunctionCallTrie and
+// generates the path(s) and the data associated with each path.
+static void populateRecords(ProfileRecordArray &PRs,
+ ProfileRecord::PathAllocator &PA,
+ const FunctionCallTrie &Trie) {
+ using StackArray = Array<const FunctionCallTrie::Node *>;
+ using StackAllocator = typename StackArray::AllocatorType;
+ StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+ StackArray DFSStack(StackAlloc);
+ for (const auto R : Trie.getRoots()) {
+ DFSStack.Append(R);
+ while (!DFSStack.empty()) {
+ auto Node = DFSStack.back();
+ DFSStack.trim(1);
+ auto Record = PRs.AppendEmplace(PA, Node);
+ if (Record == nullptr)
+ return;
+ DCHECK_NE(Record, nullptr);
+
+ // Traverse the Node's parents and as we're doing so, get the FIds in
+ // the order they appear.
+ for (auto N = Node; N != nullptr; N = N->Parent)
+ Record->Path->Append(N->FId);
+ DCHECK(!Record->Path->empty());
+
+ for (const auto C : Node->Callees)
+ DFSStack.Append(C.NodePtr);
+ }
+ }
+}
+
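+// Writes one thread's block into Buffer->Data: the 16-byte block header,
+// followed by each record's path of function ids (leaf to root), a 4-byte
+// zero sentinel terminating the path, the 8-byte call count, and the 8-byte
+// cumulative local time.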
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+ const ProfileRecordArray &ProfileRecords) {
+ auto NextPtr = static_cast<char *>(
+ internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+ sizeof(Header);
+ for (const auto &Record : ProfileRecords) {
+ // List of IDs follow:
+ for (const auto FId : *Record.Path)
+ NextPtr =
+ static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+ sizeof(FId);
+
+ // Add the sentinel here.
+ constexpr int32_t SentinelFId = 0;
+ NextPtr = static_cast<char *>(
+ internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+ sizeof(SentinelFId);
+
+ // Add the node data here.
+ NextPtr =
+ static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
+ sizeof(Record.Node->CallCount))) +
+ sizeof(Record.Node->CallCount);
+ NextPtr = static_cast<char *>(
+ internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+ sizeof(Record.Node->CumulativeLocalTime))) +
+ sizeof(Record.Node->CumulativeLocalTime);
+ }
+
+ DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
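+// Rebuilds the global ProfileBuffers from the per-thread tries: one
+// serialized block per thread whose trie has at least one root.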
+void serialize() {
+ SpinMutexLock Lock(&GlobalMutex);
+ if (ProfileBuffers == nullptr || ThreadTries == nullptr)
+ return;
+
+ // Clear out the global ProfileBuffers.
+ for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+ InternalFree((*ProfileBuffers)[I].Data);
+ ProfileBuffers->Reset();
+
+ if (ThreadTries->Size() == 0)
+ return;
+
+ // Then repopulate the global ProfileBuffers.
+ for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+ using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+ ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+ ProfileRecord::PathAllocator PathAlloc(
+ profilingFlags()->global_allocator_max);
+ ProfileRecordArray ProfileRecords(PRAlloc);
+
+ // First, compute the amount of space we're going to need. We use a local
+ // allocator and an __xray::Array<...> to store the intermediate data, and
+ // compute the size as we go. Then we allocate the contiguous space to
+ // contain the thread buffer data.
+ const auto &Trie = *(*ThreadTries)[I].Trie;
+ if (Trie.getRoots().empty())
+ continue;
+ populateRecords(ProfileRecords, PathAlloc, Trie);
+ DCHECK(!Trie.getRoots().empty());
+ DCHECK(!ProfileRecords.empty());
+
+ // Go through each record to compute the sizes.
+ //
+ // header size = block size (4 bytes)
+ //   + block number (4 bytes)
+ //   + thread id (8 bytes)
+ // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+ //   + call count (8 bytes)
+ //   + local time (8 bytes)
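+ //
+ // For example, a record whose path holds three function ids contributes
+ // (3 * 4) + 4 + 8 + 8 = 32 bytes to CumulativeSizes, and a block with a
+ // single such record occupies 16 + 32 = 48 bytes in total.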
+ u32 CumulativeSizes = 0;
+ for (const auto &Record : ProfileRecords)
+ CumulativeSizes += 20 + (4 * Record.Path->size());
+
+ BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
+ auto Buffer = ProfileBuffers->PushBack();
+ Buffer->Size = sizeof(Header) + CumulativeSizes;
+ Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
+ DCHECK_NE(Buffer->Data, nullptr);
+ serializeRecords(Buffer, Header, ProfileRecords);
+
+ // Now clean up the ProfileRecords array, one at a time.
+ for (auto &Record : ProfileRecords) {
+ Record.Path->~PathArray();
+ InternalFree(Record.Path);
+ }
+ }
+}
+
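+// Frees all collector-owned state (serialized buffers, per-thread tries, and
+// the global allocators) and re-creates fresh, empty instances ready for the
+// next profiling session.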
+void reset() {
+ SpinMutexLock Lock(&GlobalMutex);
+ if (ProfileBuffers != nullptr) {
+ // Clear out the profile buffers that have been serialized.
+ for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
+ InternalFree((*ProfileBuffers)[I].Data);
+ ProfileBuffers->Reset();
+ InternalFree(ProfileBuffers);
+ ProfileBuffers = nullptr;
+ }
+
+ if (ThreadTries != nullptr) {
+ // Clear out the function call tries per thread.
+ for (uptr I = 0; I < ThreadTries->Size(); ++I) {
+ auto &T = (*ThreadTries)[I];
+ T.Trie->~FunctionCallTrie();
+ InternalFree(T.Trie);
+ }
+ ThreadTries->Reset();
+ InternalFree(ThreadTries);
+ ThreadTries = nullptr;
+ }
+
+ // Reset the global allocators.
+ if (GlobalAllocators != nullptr) {
+ GlobalAllocators->~Allocators();
+ InternalFree(GlobalAllocators);
+ GlobalAllocators = nullptr;
+ }
+ GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
+ InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
+ new (GlobalAllocators) FunctionCallTrie::Allocators();
+ *GlobalAllocators = FunctionCallTrie::InitAllocators();
+ ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
+ InternalAlloc(sizeof(Vector<ThreadTrie>)));
+ new (ThreadTries) Vector<ThreadTrie>();
+ ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
+ InternalAlloc(sizeof(Vector<ProfileBuffer>)));
+ new (ProfileBuffers) Vector<ProfileBuffer>();
+}
+
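+// Iterator-style access to the serialized blocks: passing a zero-initialized
+// XRayBuffer returns the first block, and passing a previously returned
+// buffer returns the block that follows it (based on the BlockNum stored in
+// its header), or {nullptr, 0} once the blocks are exhausted.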
+XRayBuffer nextBuffer(XRayBuffer B) {
+ SpinMutexLock Lock(&GlobalMutex);
+
+ if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+ return {nullptr, 0};
+
+ if (B.Data == nullptr)
+ return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
+
+ BlockHeader Header;
+ internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+ auto NextBlock = Header.BlockNum + 1;
+ if (NextBlock < ProfileBuffers->Size())
+ return {(*ProfileBuffers)[NextBlock].Data,
+ (*ProfileBuffers)[NextBlock].Size};
+ return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray