Merge "profiling: Improve bookeeping."
diff --git a/protos/perfetto/trace/profiling/profile_packet.proto b/protos/perfetto/trace/profiling/profile_packet.proto
index f6bf95c..f1eb617 100644
--- a/protos/perfetto/trace/profiling/profile_packet.proto
+++ b/protos/perfetto/trace/profiling/profile_packet.proto
@@ -62,6 +62,8 @@
     // bytes freed at this frame and in functions called by it.
     optional uint64 cumulative_freed = 3;
     optional uint64 timestamp = 4;  // timestamp [opt]
+    optional uint64 alloc_count = 5;
+    optional uint64 free_count = 6;
   }
 
   repeated ProcessHeapSamples process_dumps = 5;
diff --git a/src/profiling/memory/bookkeeping.cc b/src/profiling/memory/bookkeeping.cc
index b819222..04d6dfb 100644
--- a/src/profiling/memory/bookkeeping.cc
+++ b/src/profiling/memory/bookkeeping.cc
@@ -69,9 +69,19 @@
     }
   }
 
-  GlobalCallstackTrie::Node* node =
-      callsites_->IncrementCallsite(callstack, size);
-  allocations_.emplace(address, Allocation(size, sequence_number, node));
+  GlobalCallstackTrie::Node* node = callsites_->CreateCallsite(callstack);
+
+  auto callstack_allocations_it = callstack_allocations_.find(node);
+  if (callstack_allocations_it == callstack_allocations_.end()) {
+    GlobalCallstackTrie::IncrementNode(node);
+    bool inserted;
+    std::tie(callstack_allocations_it, inserted) =
+        callstack_allocations_.emplace(node, node);
+    PERFETTO_DCHECK(inserted);
+  }
+  allocations_.emplace(
+      address,
+      Allocation(size, sequence_number, &(callstack_allocations_it->second)));
 
   // Keep the sequence tracker consistent.
   RecordFree(kNoopFree, sequence_number);
@@ -111,48 +121,77 @@
 void HeapTracker::Dump(
     ProfilePacket::ProcessHeapSamples* proto,
     std::set<GlobalCallstackTrie::Node*>* callstacks_to_dump) {
-  for (const auto& p : allocations_) {
-    const Allocation& alloc = p.second;
+  // There are two reasons we remove the unused callstack allocations on the
+  // next iteration of Dump:
+  // * We need to remove them after the callstacks were dumped, which currently
+  //   happens after the allocations are dumped.
+  // * This way, we do not destroy and recreate callstacks as frequently.
+  for (auto it_and_alloc : dead_callstack_allocations_) {
+    auto& it = it_and_alloc.first;
+    uint64_t allocated = it_and_alloc.second;
+    const CallstackAllocations& alloc = it->second;
+    if (alloc.allocation_count == allocated && alloc.free_count == allocated)
+      callstack_allocations_.erase(it);
+  }
+  dead_callstack_allocations_.clear();
+
+  for (auto it = callstack_allocations_.begin();
+       it != callstack_allocations_.end(); ++it) {
+    const CallstackAllocations& alloc = it->second;
     callstacks_to_dump->emplace(alloc.node);
     ProfilePacket::HeapSample* sample = proto->add_samples();
     sample->set_callstack_id(alloc.node->id());
-    sample->set_cumulative_allocated(alloc.total_size);
+    sample->set_cumulative_allocated(alloc.allocated);
+    sample->set_cumulative_freed(alloc.freed);
+    sample->set_alloc_count(alloc.allocation_count);
+    sample->set_free_count(alloc.free_count);
+
+    if (alloc.allocation_count == alloc.free_count)
+      dead_callstack_allocations_.emplace_back(it, alloc.allocation_count);
   }
 }
 
-uint64_t GlobalCallstackTrie::GetCumSizeForTesting(
+uint64_t HeapTracker::GetSizeForTesting(
+    const std::vector<unwindstack::FrameData>& stack) {
+  GlobalCallstackTrie::Node* node = callsites_->CreateCallsite(stack);
+  // Hack to make it go away again if it wasn't used before.
+  // This is only good because this is used for testing only.
+  GlobalCallstackTrie::IncrementNode(node);
+  GlobalCallstackTrie::DecrementNode(node);
+  auto it = callstack_allocations_.find(node);
+  if (it == callstack_allocations_.end()) {
+    return 0;
+  }
+  const CallstackAllocations& alloc = it->second;
+  return alloc.allocated - alloc.freed;
+}
+
+GlobalCallstackTrie::Node* GlobalCallstackTrie::CreateCallsite(
     const std::vector<unwindstack::FrameData>& callstack) {
   Node* node = &root_;
   for (const unwindstack::FrameData& loc : callstack) {
-    node = node->children_.Get(InternCodeLocation(loc));
-    if (node == nullptr)
-      return 0;
-  }
-  return node->cum_size_;
-}
-
-GlobalCallstackTrie::Node* GlobalCallstackTrie::IncrementCallsite(
-    const std::vector<unwindstack::FrameData>& callstack,
-    uint64_t size) {
-  Node* node = &root_;
-  node->cum_size_ += size;
-  for (const unwindstack::FrameData& loc : callstack) {
     node = node->GetOrCreateChild(InternCodeLocation(loc));
-    node->cum_size_ += size;
   }
   return node;
 }
 
-void GlobalCallstackTrie::DecrementNode(Node* node, uint64_t size) {
-  PERFETTO_DCHECK(node->cum_size_ >= size);
+void GlobalCallstackTrie::IncrementNode(Node* node) {
+  while (node != nullptr) {
+    node->ref_count_ += 1;
+    node = node->parent_;
+  }
+}
+
+void GlobalCallstackTrie::DecrementNode(Node* node) {
+  PERFETTO_DCHECK(node->ref_count_ >= 1);
 
   bool delete_prev = false;
   Node* prev = nullptr;
   while (node != nullptr) {
     if (delete_prev)
       node->children_.Remove(*prev);
-    node->cum_size_ -= size;
-    delete_prev = node->cum_size_ == 0;
+    node->ref_count_ -= 1;
+    delete_prev = node->ref_count_ == 0;
     prev = node;
     node = node->parent_;
   }
diff --git a/src/profiling/memory/bookkeeping.h b/src/profiling/memory/bookkeeping.h
index 755329b..0bd849a 100644
--- a/src/profiling/memory/bookkeeping.h
+++ b/src/profiling/memory/bookkeeping.h
@@ -29,6 +29,67 @@
 #include "src/profiling/memory/interner.h"
 #include "src/profiling/memory/queue_messages.h"
 
+// Below is an illustration of the bookkeeping system state where
+// PID 1 does the following allocations:
+// 0x123: 128 bytes at [bar main]
+// 0x234: 128 bytes at [bar main]
+// 0xf00: 512 bytes at [foo main]
+// PID 1 allocated but previously freed 1024 bytes at [bar main]
+//
+// PID 2 does the following allocations:
+// 0x345: 512 bytes at [foo main]
+// 0x456:  32 bytes at [foo main]
+// PID 2 allocated but already freed 1235 bytes at [foo main]
+// PID 2 allocated and freed 2048 bytes in main.
+//
+// +---------------------------------+   +-------------------+
+// | +---------+    HeapTracker PID 1|   | GlobalCallstackTri|
+// | |0x123 128+---+    +----------+ |   |           +---+   |
+// | |         |   +---->alloc:1280+----------------->bar|   |
+// | |0x234 128+---+    |free: 1024| |   |           +-^-+   |
+// | |         |        +----------+ |   |   +---+     ^     |
+// | |0xf00 512+---+                 | +----->foo|     |     |
+// | +--------+|   |    +----------+ | | |   +-^-+     |     |
+// |               +---->alloc: 512+---+ |     |       |     |
+// |                    |free:    0| | | |     +--+----+     |
+// |                    +----------+ | | |        |          |
+// |                                 | | |      +-+--+       |
+// +---------------------------------+ | |      |main|       |
+//                                     | |      +--+-+       |
+// +---------------------------------+ | |         ^         |
+// | +---------+    HeapTracker PID 2| | +-------------------+
+// | |0x345 512+---+    +----------+ | |           |
+// | |         |   +---->alloc:1779+---+           |
+// | |0x456  32+---+    |free: 1235| |             |
+// | +---------+        +----------+ |             |
+// |                                 |             |
+// |                    +----------+ |             |
+// |                    |alloc:2048+---------------+
+// |                    |free: 2048| |
+// |                    +----------+ |
+// |                                 |
+// +---------------------------------+
+//   Allocation    CallstackAllocations        Node
+//
+// The active allocations are on the leftmost side, modeled as the class
+// HeapTracker::Allocation.
+//
+// The total allocated and freed bytes per callsite are in the middle, modeled
+// as the HeapTracker::CallstackAllocations class.
+// Note that (1280 - 1024) = 256, so alloc - free is equal to the total of the
+// currently active allocations.
+// Note in PID 2 there is a CallstackAllocations with 2048 allocated and 2048
+// freed bytes. This is not currently referenced by any Allocations (as it
+// should, as 2048 - 2048 = 0, which would mean that the total size of the
+// allocations referencing it should be 0). This is because we haven't dumped
+// this state yet, so the CallstackAllocations will be kept around until the
+// next dump, written to the trace, and then destroyed.
+//
+// On the right hand side is the GlobalCallstackTrie, with nodes representing
+// distinct callstacks. They have no information about the currently allocated
+// or freed bytes, they only contain a reference count to destroy them as
+// soon as they are no longer referenced by a HeapTracker.
+
 namespace perfetto {
 namespace profiling {
 
@@ -103,7 +164,7 @@
    private:
     Node* GetOrCreateChild(const Interner<Frame>::Interned& loc);
 
-    uint64_t cum_size_ = 0;
+    uint64_t ref_count_ = 0;
     Node* const parent_;
     const Interner<Frame>::Interned location_;
     base::LookupSet<Node, const Interner<Frame>::Interned, &Node::location_>
@@ -114,11 +175,9 @@
   GlobalCallstackTrie(const GlobalCallstackTrie&) = delete;
   GlobalCallstackTrie& operator=(const GlobalCallstackTrie&) = delete;
 
-  uint64_t GetCumSizeForTesting(
-      const std::vector<unwindstack::FrameData>& stack);
-  Node* IncrementCallsite(const std::vector<unwindstack::FrameData>& locs,
-                          uint64_t size);
-  static void DecrementNode(Node* node, uint64_t size);
+  Node* CreateCallsite(const std::vector<unwindstack::FrameData>& locs);
+  static void DecrementNode(Node* node);
+  static void IncrementNode(Node* node);
 
  private:
   Interner<Frame>::Interned InternCodeLocation(
@@ -161,29 +220,57 @@
   void Dump(protos::pbzero::ProfilePacket::ProcessHeapSamples* proto,
             std::set<GlobalCallstackTrie::Node*>* callstacks_to_dump);
 
+  uint64_t GetSizeForTesting(const std::vector<unwindstack::FrameData>& stack);
+
  private:
   static constexpr uint64_t kNoopFree = 0;
+
+  struct CallstackAllocations {
+    CallstackAllocations(GlobalCallstackTrie::Node* n) : node(n) {}
+
+    uint64_t allocated = 0;
+    uint64_t freed = 0;
+    uint64_t allocation_count = 0;
+    uint64_t free_count = 0;
+
+    GlobalCallstackTrie::Node* node;
+
+    ~CallstackAllocations() {
+      if (node)
+        GlobalCallstackTrie::DecrementNode(node);
+    }
+
+    bool operator<(const CallstackAllocations& other) const {
+      return node < other.node;
+    }
+  };
+
   struct Allocation {
-    Allocation(uint64_t size, uint64_t seq, GlobalCallstackTrie::Node* n)
-        : total_size(size), sequence_number(seq), node(n) {}
+    Allocation(uint64_t size, uint64_t seq, CallstackAllocations* csa)
+        : total_size(size), sequence_number(seq), callstack_allocations(csa) {
+      callstack_allocations->allocation_count++;
+      callstack_allocations->allocated += total_size;
+    }
 
     Allocation() = default;
     Allocation(const Allocation&) = delete;
     Allocation(Allocation&& other) noexcept {
       total_size = other.total_size;
       sequence_number = other.sequence_number;
-      node = other.node;
-      other.node = nullptr;
+      callstack_allocations = other.callstack_allocations;
+      other.callstack_allocations = nullptr;
     }
 
     ~Allocation() {
-      if (node)
-        GlobalCallstackTrie::DecrementNode(node, total_size);
+      if (callstack_allocations) {
+        callstack_allocations->free_count++;
+        callstack_allocations->freed += total_size;
+      }
     }
 
     uint64_t total_size;
     uint64_t sequence_number;
-    GlobalCallstackTrie::Node* node;
+    CallstackAllocations* callstack_allocations;
   };
 
   // Sequencing logic works as following:
@@ -209,6 +296,15 @@
   // commited to |allocations_|.
   void CommitFree(uint64_t sequence_number, uint64_t address);
 
+  // We cannot use an interner here, because after the last allocation goes
+  // away, we still need to keep the CallstackAllocations around until the next
+  // dump.
+  std::map<GlobalCallstackTrie::Node*, CallstackAllocations>
+      callstack_allocations_;
+
+  std::vector<std::pair<decltype(callstack_allocations_)::iterator, uint64_t>>
+      dead_callstack_allocations_;
+
   // Address -> (size, sequence_number, code location)
   std::map<uint64_t, Allocation> allocations_;
 
diff --git a/src/profiling/memory/bookkeeping_unittest.cc b/src/profiling/memory/bookkeeping_unittest.cc
index 8049ae6..fe545d1 100644
--- a/src/profiling/memory/bookkeeping_unittest.cc
+++ b/src/profiling/memory/bookkeeping_unittest.cc
@@ -49,15 +49,6 @@
   return res;
 }
 
-std::vector<unwindstack::FrameData> topframe() {
-  std::vector<unwindstack::FrameData> res;
-  unwindstack::FrameData data{};
-  data.function_name = "fun1";
-  data.map_name = "map1";
-  res.emplace_back(std::move(data));
-  return res;
-}
-
 TEST(BookkeepingTest, Basic) {
   uint64_t sequence_number = 1;
   GlobalCallstackTrie c;
@@ -65,11 +56,14 @@
 
   hd.RecordMalloc(stack(), 1, 5, sequence_number++);
   hd.RecordMalloc(stack2(), 2, 2, sequence_number++);
-  ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 7);
+  ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
+  ASSERT_EQ(hd.GetSizeForTesting(stack2()), 2);
   hd.RecordFree(2, sequence_number++);
-  ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 5);
+  ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
+  ASSERT_EQ(hd.GetSizeForTesting(stack2()), 0);
   hd.RecordFree(1, sequence_number++);
-  ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 0);
+  ASSERT_EQ(hd.GetSizeForTesting(stack()), 0);
+  ASSERT_EQ(hd.GetSizeForTesting(stack2()), 0);
 }
 
 TEST(BookkeepingTest, TwoHeapTrackers) {
@@ -80,10 +74,11 @@
     HeapTracker hd2(&c);
 
     hd.RecordMalloc(stack(), 1, 5, sequence_number++);
-    hd2.RecordMalloc(stack2(), 2, 2, sequence_number++);
-    ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 7);
+    hd2.RecordMalloc(stack(), 2, 2, sequence_number++);
+    ASSERT_EQ(hd2.GetSizeForTesting(stack()), 2);
+    ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
   }
-  ASSERT_EQ(c.GetCumSizeForTesting(topframe()), 5);
+  ASSERT_EQ(hd.GetSizeForTesting(stack()), 5);
 }
 
 TEST(BookkeepingTest, ReplaceAlloc) {
@@ -93,8 +88,8 @@
 
   hd.RecordMalloc(stack(), 1, 5, sequence_number++);
   hd.RecordMalloc(stack2(), 1, 2, sequence_number++);
-  EXPECT_EQ(c.GetCumSizeForTesting(stack()), 0);
-  EXPECT_EQ(c.GetCumSizeForTesting(stack2()), 2);
+  EXPECT_EQ(hd.GetSizeForTesting(stack()), 0);
+  EXPECT_EQ(hd.GetSizeForTesting(stack2()), 2);
 }
 
 TEST(BookkeepingTest, OutOfOrder) {
@@ -103,8 +98,8 @@
 
   hd.RecordMalloc(stack(), 1, 5, 1);
   hd.RecordMalloc(stack2(), 1, 2, 0);
-  EXPECT_EQ(c.GetCumSizeForTesting(stack()), 5);
-  EXPECT_EQ(c.GetCumSizeForTesting(stack2()), 0);
+  EXPECT_EQ(hd.GetSizeForTesting(stack()), 5);
+  EXPECT_EQ(hd.GetSizeForTesting(stack2()), 0);
 }
 
 }  // namespace