Merge "profiling: Improve bookkeeping."
diff --git a/protos/perfetto/trace/profiling/profile_packet.proto b/protos/perfetto/trace/profiling/profile_packet.proto
index f6bf95c..f1eb617 100644
--- a/protos/perfetto/trace/profiling/profile_packet.proto
+++ b/protos/perfetto/trace/profiling/profile_packet.proto
@@ -62,6 +62,8 @@
// bytes freed at this frame and in functions called by it.
optional uint64 cumulative_freed = 3;
optional uint64 timestamp = 4; // timestamp [opt]
+ optional uint64 alloc_count = 5;
+ optional uint64 free_count = 6;
}
repeated ProcessHeapSamples process_dumps = 5;
diff --git a/src/profiling/memory/bookkeeping.cc b/src/profiling/memory/bookkeeping.cc
index b819222..04d6dfb 100644
--- a/src/profiling/memory/bookkeeping.cc
+++ b/src/profiling/memory/bookkeeping.cc
@@ -69,9 +69,19 @@
}
}
- GlobalCallstackTrie::Node* node =
- callsites_->IncrementCallsite(callstack, size);
- allocations_.emplace(address, Allocation(size, sequence_number, node));
+ GlobalCallstackTrie::Node* node = callsites_->CreateCallsite(callstack);
+
+ auto callstack_allocations_it = callstack_allocations_.find(node);
+ if (callstack_allocations_it == callstack_allocations_.end()) {
+ GlobalCallstackTrie::IncrementNode(node);
+ bool inserted;
+ std::tie(callstack_allocations_it, inserted) =
+ callstack_allocations_.emplace(node, node);
+ PERFETTO_DCHECK(inserted);
+ }
+ allocations_.emplace(
+ address,
+ Allocation(size, sequence_number, &(callstack_allocations_it->second)));
// Keep the sequence tracker consistent.
RecordFree(kNoopFree, sequence_number);
@@ -111,48 +121,77 @@
void HeapTracker::Dump(
ProfilePacket::ProcessHeapSamples* proto,
std::set<GlobalCallstackTrie::Node*>* callstacks_to_dump) {
- for (const auto& p : allocations_) {
- const Allocation& alloc = p.second;
+ // There are two reasons we remove the unused callstack allocations on the
+ // next iteration of Dump:
+ // * We need to remove them after the callstacks were dumped, which currently
+ // happens after the allocations are dumped.
+ // * This way, we do not destroy and recreate callstacks as frequently.
+ for (auto it_and_alloc : dead_callstack_allocations_) {
+ auto& it = it_and_alloc.first;
+ uint64_t allocated = it_and_alloc.second;
+ const CallstackAllocations& alloc = it->second;
+ if (alloc.allocation_count == allocated && alloc.free_count == allocated)
+ callstack_allocations_.erase(it);
+ }
+ dead_callstack_allocations_.clear();
+
+ for (auto it = callstack_allocations_.begin();
+ it != callstack_allocations_.end(); ++it) {
+ const CallstackAllocations& alloc = it->second;
callstacks_to_dump->emplace(alloc.node);
ProfilePacket::HeapSample* sample = proto->add_samples();
sample->set_callstack_id(alloc.node->id());
- sample->set_cumulative_allocated(alloc.total_size);
+ sample->set_cumulative_allocated(alloc.allocated);
+ sample->set_cumulative_freed(alloc.freed);
+ sample->set_alloc_count(alloc.allocation_count);
+ sample->set_free_count(alloc.free_count);
+
+ if (alloc.allocation_count == alloc.free_count)
+ dead_callstack_allocations_.emplace_back(it, alloc.allocation_count);
}
}
-uint64_t GlobalCallstackTrie::GetCumSizeForTesting(
+uint64_t HeapTracker::GetSizeForTesting(
+ const std::vector<unwindstack::FrameData>& stack) {
+ GlobalCallstackTrie::Node* node = callsites_->CreateCallsite(stack);
+  // Hack: bump then drop the ref count so a node created solely by this
+  // lookup is destroyed again. Acceptable only because this is test-only code.
+ GlobalCallstackTrie::IncrementNode(node);
+ GlobalCallstackTrie::DecrementNode(node);
+ auto it = callstack_allocations_.find(node);
+ if (it == callstack_allocations_.end()) {
+ return 0;
+ }
+ const CallstackAllocations& alloc = it->second;
+ return alloc.allocated - alloc.freed;
+}
+
+GlobalCallstackTrie::Node* GlobalCallstackTrie::CreateCallsite(
const std::vector<unwindstack::FrameData>& callstack) {
Node* node = &root_;
for (const unwindstack::FrameData& loc : callstack) {
- node = node->children_.Get(InternCodeLocation(loc));
- if (node == nullptr)
- return 0;
- }
- return node->cum_size_;
-}
-
-GlobalCallstackTrie::Node* GlobalCallstackTrie::IncrementCallsite(
- const std::vector<unwindstack::FrameData>& callstack,
- uint64_t size) {
- Node* node = &root_;
- node->cum_size_ += size;
- for (const unwindstack::FrameData& loc : callstack) {
node = node->GetOrCreateChild(InternCodeLocation(loc));
- node->cum_size_ += size;
}
return node;
}
-void GlobalCallstackTrie::DecrementNode(Node* node, uint64_t size) {
- PERFETTO_DCHECK(node->cum_size_ >= size);
+void GlobalCallstackTrie::IncrementNode(Node* node) {
+ while (node != nullptr) {
+ node->ref_count_ += 1;
+ node = node->parent_;
+ }
+}
+
+void GlobalCallstackTrie::DecrementNode(Node* node) {
+ PERFETTO_DCHECK(node->ref_count_ >= 1);
bool delete_prev = false;
Node* prev = nullptr;
while (node != nullptr) {
if (delete_prev)
node->children_.Remove(*prev);
- node->cum_size_ -= size;
- delete_prev = node->cum_size_ == 0;
+ node->ref_count_ -= 1;
+ delete_prev = node->ref_count_ == 0;
prev = node;
node = node->parent_;
}
diff --git a/src/profiling/memory/bookkeeping.h b/src/profiling/memory/bookkeeping.h
index 755329b..0bd849a 100644
--- a/src/profiling/memory/bookkeeping.h
+++ b/src/profiling/memory/bookkeeping.h
@@ -29,6 +29,67 @@
#include "src/profiling/memory/interner.h"
#include "src/profiling/memory/queue_messages.h"
+// Below is an illustration of the bookkeeping system state where
+// PID 1 does the following allocations:
+// 0x123: 128 bytes at [bar main]
+// 0x234: 128 bytes at [bar main]
+// 0xf00: 512 bytes at [foo main]
+// PID 1 allocated but previously freed 1024 bytes at [bar main]
+//
+// PID 2 does the following allocations:
+// 0x345: 512 bytes at [foo main]
+// 0x456: 32 bytes at [foo main]
+// PID 2 allocated but already freed 1235 bytes at [foo main]
+// PID 2 allocated and freed 2048 bytes in main.
+//
+// +---------------------------------+ +-------------------+
+// | +---------+ HeapTracker PID 1| | GlobalCallstackTri|
+// | |0x123 128+---+ +----------+ | | +---+ |
+// | | | +---->alloc:1280+----------------->bar| |
+// | |0x234 128+---+ |free: 1024| | | +-^-+ |
+// | | | +----------+ | | +---+ ^ |
+// | |0xf00 512+---+ | +----->foo| | |
+// | +--------+| | +----------+ | | | +-^-+ | |
+// | +---->alloc: 512+---+ | | | |
+// | |free: 0| | | | +--+----+ |
+// | +----------+ | | | | |
+// | | | | +-+--+ |
+// +---------------------------------+ | | |main| |
+// | | +--+-+ |
+// +---------------------------------+ | | ^ |
+// | +---------+ HeapTracker PID 2| | +-------------------+
+// | |0x345 512+---+ +----------+ | | |
+// | | | +---->alloc:1779+---+ |
+// | |0x456 32+---+ |free: 1235| | |
+// | +---------+ +----------+ | |
+// | | |
+// | +----------+ | |
+// | |alloc:2048+---------------+
+// | |free: 2048| |
+// | +----------+ |
+// | |
+// +---------------------------------+
+// Allocation CallstackAllocations Node
+//
+// The active allocations are on the leftmost side, modeled as the class
+// HeapTracker::Allocation.
+//
+// The total allocated and freed bytes per callsite are in the middle, modeled
+// as the HeapTracker::CallstackAllocations class.
+// Note that (1280 - 1024) = 256, so alloc - free is equal to the total of the
+// currently active allocations.
+// Note in PID 2 there is a CallstackAllocations with 2048 allocated and 2048
+// freed bytes. This is not currently referenced by any Allocations (as
+// expected, since 2048 - 2048 = 0, meaning the total size of any active
+// allocations referencing it would be 0). This is because we haven't dumped
+// this state yet, so the CallstackAllocations will be kept around until the
+// next dump, written to the trace, and then destroyed.
+//
+// On the right hand side is the GlobalCallstackTrie, with nodes representing
+// distinct callstacks. They have no information about the currently allocated
+// or freed bytes; they only carry a reference count so they can be destroyed
+// as soon as they are no longer referenced by any HeapTracker.
+
namespace perfetto {
namespace profiling {
@@ -103,7 +164,7 @@
private:
Node* GetOrCreateChild(const Interner<Frame>::Interned& loc);
- uint64_t cum_size_ = 0;
+ uint64_t ref_count_ = 0;
Node* const parent_;
const Interner<Frame>::Interned location_;
base::LookupSet<Node, const Interner<Frame>::Interned, &Node::location_>
@@ -114,11 +175,9 @@
GlobalCallstackTrie(const GlobalCallstackTrie&) = delete;
GlobalCallstackTrie& operator=(const GlobalCallstackTrie&) = delete;
- uint64_t GetCumSizeForTesting(
- const std::vector<unwindstack::FrameData>& stack);
- Node* IncrementCallsite(const std::vector<unwindstack::FrameData>& locs,
- uint64_t size);
- static void DecrementNode(Node* node, uint64_t size);
+ Node* CreateCallsite(const std::vector<unwindstack::FrameData>& locs);
+ static void DecrementNode(Node* node);
+ static void IncrementNode(Node* node);
private:
Interner<Frame>::Interned InternCodeLocation(
@@ -161,29 +220,57 @@
void Dump(protos::pbzero::ProfilePacket::ProcessHeapSamples* proto,
std::set<GlobalCallstackTrie::Node*>* callstacks_to_dump);
+ uint64_t GetSizeForTesting(const std::vector<unwindstack::FrameData>& stack);
+
private:
static constexpr uint64_t kNoopFree = 0;
+
+ struct CallstackAllocations {
+ CallstackAllocations(GlobalCallstackTrie::Node* n) : node(n) {}
+
+ uint64_t allocated = 0;
+ uint64_t freed = 0;
+ uint64_t allocation_count = 0;
+ uint64_t free_count = 0;
+
+ GlobalCallstackTrie::Node* node;
+
+ ~CallstackAllocations() {
+ if (node)
+ GlobalCallstackTrie::DecrementNode(node);
+ }
+
+ bool operator<(const CallstackAllocations& other) const {
+ return node < other.node;
+ }
+ };
+
struct Allocation {
- Allocation(uint64_t size, uint64_t seq, GlobalCallstackTrie::Node* n)
- : total_size(size), sequence_number(seq), node(n) {}
+ Allocation(uint64_t size, uint64_t seq, CallstackAllocations* csa)
+ : total_size(size), sequence_number(seq), callstack_allocations(csa) {
+ callstack_allocations->allocation_count++;
+ callstack_allocations->allocated += total_size;
+ }
Allocation() = default;
Allocation(const Allocation&) = delete;
Allocation(Allocation&& other) noexcept {
total_size = other.total_size;
sequence_number = other.sequence_number;
- node = other.node;
- other.node = nullptr;
+ callstack_allocations = other.callstack_allocations;
+ other.callstack_allocations = nullptr;
}
~Allocation() {
- if (node)
- GlobalCallstackTrie::DecrementNode(node, total_size);
+ if (callstack_allocations) {
+ callstack_allocations->free_count++;
+ callstack_allocations->freed += total_size;
+ }
}
uint64_t total_size;
uint64_t sequence_number;
- GlobalCallstackTrie::Node* node;
+ CallstackAllocations* callstack_allocations;
};
// Sequencing logic works as following:
@@ -209,6 +296,15 @@
// commited to |allocations_|.
void CommitFree(uint64_t sequence_number, uint64_t address);
+ // We cannot use an interner here, because after the last allocation goes
+ // away, we still need to keep the CallstackAllocations around until the next
+ // dump.
+ std::map<GlobalCallstackTrie::Node*, CallstackAllocations>
+ callstack_allocations_;
+
+ std::vector<std::pair<decltype(callstack_allocations_)::iterator, uint64_t>>
+ dead_callstack_allocations_;
+
// Address -> (size, sequence_number, code location)
std::map<uint64_t, Allocation> allocations_;
diff --git a/src/profiling/memory/bookkeeping_unittest.cc b/src/profiling/memory/bookkeeping_unittest.cc
index 8049ae6..fe545d1 100644
--- a/src/profiling/memory/bookkeeping_unittest.cc
+++ b/src/profiling/memory/bookkeeping_unittest.cc
@@ -49,15 +49,6 @@
return res;
}
-std::vector<unwindstack::FrameData> topframe() {
- std::vector<unwindstack::FrameData> res;
- unwindstack::FrameData data{};
- data.function_name = "fun1";
- data.map_name = "map1";
- res.emplace_back(std::move(data));
- return res;
-}
-
TEST(BookkeepingTest, Basic) {
uint64_t sequence_number = 1;
GlobalCallstackTrie c;
@@ -65,11 +56,14 @@
hd.RecordMalloc(stack(), 1, 5, sequence_number++);
hd.RecordMalloc(stack2(), 2, 2, sequence_number++);
- ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 7);
+ ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
+ ASSERT_EQ(hd.GetSizeForTesting(stack2()), 2);
hd.RecordFree(2, sequence_number++);
- ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 5);
+ ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
+ ASSERT_EQ(hd.GetSizeForTesting(stack2()), 0);
hd.RecordFree(1, sequence_number++);
- ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 0);
+ ASSERT_EQ(hd.GetSizeForTesting(stack()), 0);
+ ASSERT_EQ(hd.GetSizeForTesting(stack2()), 0);
}
TEST(BookkeepingTest, TwoHeapTrackers) {
@@ -80,10 +74,11 @@
HeapTracker hd2(&c);
hd.RecordMalloc(stack(), 1, 5, sequence_number++);
- hd2.RecordMalloc(stack2(), 2, 2, sequence_number++);
- ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 7);
+ hd2.RecordMalloc(stack(), 2, 2, sequence_number++);
+ ASSERT_EQ(hd2.GetSizeForTesting(stack()), 2);
+ ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
}
- ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 5);
+ ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
}
TEST(BookkeepingTest, ReplaceAlloc) {
@@ -93,8 +88,8 @@
hd.RecordMalloc(stack(), 1, 5, sequence_number++);
hd.RecordMalloc(stack2(), 1, 2, sequence_number++);
- EXPECT_EQ(c.GetCumSizeForTesting(stack()), 0);
- EXPECT_EQ(c.GetCumSizeForTesting(stack2()), 2);
+ EXPECT_EQ(hd.GetSizeForTesting(stack()), 0);
+ EXPECT_EQ(hd.GetSizeForTesting(stack2()), 2);
}
TEST(BookkeepingTest, OutOfOrder) {
@@ -103,8 +98,8 @@
hd.RecordMalloc(stack(), 1, 5, 1);
hd.RecordMalloc(stack2(), 1, 2, 0);
- EXPECT_EQ(c.GetCumSizeForTesting(stack()), 5);
- EXPECT_EQ(c.GetCumSizeForTesting(stack2()), 0);
+ EXPECT_EQ(hd.GetSizeForTesting(stack()), 5);
+ EXPECT_EQ(hd.GetSizeForTesting(stack2()), 0);
}
} // namespace