Add allocation stack traces for HPROF dump.

This feature is currently enabled only when DDMS's allocation tracking
is enabled. In the future there should be a way to enable it before an
application starts.

Also updates DDMS's recent allocation tracking to use a new backend
data structure (gc::AllocRecordObjectMap) that is shared with this
feature.

The following system properties control the customizable parameters:
dalvik.vm.allocTrackerMax: max number of objects that have allocation
                           records, default 512K;

dalvik.vm.recentAllocMax:  max number of records sent to DDMS when the
                           "Get Allocations" button is clicked,
                           default 64K-1 (the limit of the protocol);

dalvik.vm.allocStackDepth: max number of stack frames in an allocation
                           record, default 4.
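
Example (illustrative values; setting system properties typically
requires a userdebug/eng build or adb root):

  adb shell setprop dalvik.vm.allocTrackerMax 262144
  adb shell setprop dalvik.vm.recentAllocMax 32767
  adb shell setprop dalvik.vm.allocStackDepth 8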

Bug: 20037135
Change-Id: I26ed378a5613678bd3c43e846025f90470a8e059
diff --git a/runtime/Android.mk b/runtime/Android.mk
index c1e6e09..5ed6955 100644
--- a/runtime/Android.mk
+++ b/runtime/Android.mk
@@ -45,6 +45,7 @@
   dex_file_verifier.cc \
   dex_instruction.cc \
   elf_file.cc \
+  gc/allocation_record.cc \
   gc/allocator/dlmalloc.cc \
   gc/allocator/rosalloc.cc \
   gc/accounting/bitmap.cc \
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index 24615e2..3c75daf 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -29,6 +29,7 @@
 #include "dex_file-inl.h"
 #include "dex_instruction.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc/allocation_record.h"
 #include "gc/space/large_object_space.h"
 #include "gc/space/space-inl.h"
 #include "handle_scope.h"
@@ -61,127 +62,30 @@
 // The key identifying the debugger to update instrumentation.
 static constexpr const char* kDbgInstrumentationKey = "Debugger";
 
-static const size_t kMaxAllocRecordStackDepth = 16;  // Max 255.
-static const size_t kDefaultNumAllocRecords = 64*1024;  // Must be a power of 2. 2BE can hold 64k-1.
-
-// Limit alloc_record_count to the 2BE value that is the limit of the current protocol.
+// Limit alloc_record_count to the 2BE value (64k-1) that is the limit of the current protocol.
 static uint16_t CappedAllocRecordCount(size_t alloc_record_count) {
-  if (alloc_record_count > 0xffff) {
-    return 0xffff;
+  size_t cap = 0xffff;
+#ifdef HAVE_ANDROID_OS
+  // Check whether there's a system property overriding the number of recent records.
+  const char* propertyName = "dalvik.vm.recentAllocMax";
+  char recentAllocMaxString[PROPERTY_VALUE_MAX];
+  if (property_get(propertyName, recentAllocMaxString, "") > 0) {
+    char* end;
+    size_t value = strtoul(recentAllocMaxString, &end, 10);
+    if (*end != '\0') {
+      LOG(ERROR) << "Ignoring  " << propertyName << " '" << recentAllocMaxString
+                 << "' --- invalid";
+    } else {
+      cap = value;
+    }
+  }
+#endif
+  if (alloc_record_count > cap) {
+    return cap;
   }
   return alloc_record_count;
 }
 
-class AllocRecordStackTraceElement {
- public:
-  AllocRecordStackTraceElement() : method_(nullptr), dex_pc_(0) {
-  }
-
-  int32_t LineNumber() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ArtMethod* method = Method();
-    DCHECK(method != nullptr);
-    return method->GetLineNumFromDexPC(DexPc());
-  }
-
-  ArtMethod* Method() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    return soa.DecodeMethod(method_);
-  }
-
-  void SetMethod(ArtMethod* m) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    method_ = soa.EncodeMethod(m);
-  }
-
-  uint32_t DexPc() const {
-    return dex_pc_;
-  }
-
-  void SetDexPc(uint32_t pc) {
-    dex_pc_ = pc;
-  }
-
- private:
-  jmethodID method_;
-  uint32_t dex_pc_;
-};
-
-jobject Dbg::TypeCache::Add(mirror::Class* t) {
-  ScopedObjectAccessUnchecked soa(Thread::Current());
-  JNIEnv* const env = soa.Env();
-  ScopedLocalRef<jobject> local_ref(soa.Env(), soa.AddLocalReference<jobject>(t));
-  const int32_t hash_code = soa.Decode<mirror::Class*>(local_ref.get())->IdentityHashCode();
-  auto range = objects_.equal_range(hash_code);
-  for (auto it = range.first; it != range.second; ++it) {
-    if (soa.Decode<mirror::Class*>(it->second) == soa.Decode<mirror::Class*>(local_ref.get())) {
-      // Found a matching weak global, return it.
-      return it->second;
-    }
-  }
-  const jobject weak_global = env->NewWeakGlobalRef(local_ref.get());
-  objects_.insert(std::make_pair(hash_code, weak_global));
-  return weak_global;
-}
-
-void Dbg::TypeCache::Clear() {
-  JavaVMExt* vm = Runtime::Current()->GetJavaVM();
-  Thread* self = Thread::Current();
-  for (const auto& p : objects_) {
-    vm->DeleteWeakGlobalRef(self, p.second);
-  }
-  objects_.clear();
-}
-
-class AllocRecord {
- public:
-  AllocRecord() : type_(nullptr), byte_count_(0), thin_lock_id_(0) {}
-
-  mirror::Class* Type() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    return down_cast<mirror::Class*>(Thread::Current()->DecodeJObject(type_));
-  }
-
-  void SetType(mirror::Class* t) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_,
-                                                       Locks::alloc_tracker_lock_) {
-    type_ = Dbg::type_cache_.Add(t);
-  }
-
-  size_t GetDepth() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    size_t depth = 0;
-    while (depth < kMaxAllocRecordStackDepth && stack_[depth].Method() != nullptr) {
-      ++depth;
-    }
-    return depth;
-  }
-
-  size_t ByteCount() const {
-    return byte_count_;
-  }
-
-  void SetByteCount(size_t count) {
-    byte_count_ = count;
-  }
-
-  uint16_t ThinLockId() const {
-    return thin_lock_id_;
-  }
-
-  void SetThinLockId(uint16_t id) {
-    thin_lock_id_ = id;
-  }
-
-  AllocRecordStackTraceElement* StackElement(size_t index) {
-    DCHECK_LT(index, kMaxAllocRecordStackDepth);
-    return &stack_[index];
-  }
-
- private:
-  jobject type_;  // This is a weak global.
-  size_t byte_count_;
-  uint16_t thin_lock_id_;
-  // Unused entries have null method.
-  AllocRecordStackTraceElement stack_[kMaxAllocRecordStackDepth];
-};
-
 class Breakpoint {
  public:
   Breakpoint(ArtMethod* method, uint32_t dex_pc,
@@ -382,13 +286,6 @@
 bool Dbg::gDisposed = false;
 ObjectRegistry* Dbg::gRegistry = nullptr;
 
-// Recent allocation tracking.
-AllocRecord* Dbg::recent_allocation_records_ = nullptr;  // TODO: CircularBuffer<AllocRecord>
-size_t Dbg::alloc_record_max_ = 0;
-size_t Dbg::alloc_record_head_ = 0;
-size_t Dbg::alloc_record_count_ = 0;
-Dbg::TypeCache Dbg::type_cache_;
-
 // Deoptimization support.
 std::vector<DeoptimizationRequest> Dbg::deoptimization_requests_;
 size_t Dbg::full_deoptimization_event_count_ = 0;
@@ -4665,177 +4562,41 @@
   Dbg::DdmSendChunk(native ? CHUNK_TYPE("NHEN") : CHUNK_TYPE("HPEN"), sizeof(heap_id), heap_id);
 }
 
-static size_t GetAllocTrackerMax() {
-#ifdef HAVE_ANDROID_OS
-  // Check whether there's a system property overriding the number of records.
-  const char* propertyName = "dalvik.vm.allocTrackerMax";
-  char allocRecordMaxString[PROPERTY_VALUE_MAX];
-  if (property_get(propertyName, allocRecordMaxString, "") > 0) {
-    char* end;
-    size_t value = strtoul(allocRecordMaxString, &end, 10);
-    if (*end != '\0') {
-      LOG(ERROR) << "Ignoring  " << propertyName << " '" << allocRecordMaxString
-                 << "' --- invalid";
-      return kDefaultNumAllocRecords;
-    }
-    if (!IsPowerOfTwo(value)) {
-      LOG(ERROR) << "Ignoring  " << propertyName << " '" << allocRecordMaxString
-                 << "' --- not power of two";
-      return kDefaultNumAllocRecords;
-    }
-    return value;
-  }
-#endif
-  return kDefaultNumAllocRecords;
-}
-
 void Dbg::SetAllocTrackingEnabled(bool enable) {
-  Thread* self = Thread::Current();
-  if (enable) {
-    {
-      MutexLock mu(self, *Locks::alloc_tracker_lock_);
-      if (recent_allocation_records_ != nullptr) {
-        return;  // Already enabled, bail.
-      }
-      alloc_record_max_ = GetAllocTrackerMax();
-      LOG(INFO) << "Enabling alloc tracker (" << alloc_record_max_ << " entries of "
-                << kMaxAllocRecordStackDepth << " frames, taking "
-                << PrettySize(sizeof(AllocRecord) * alloc_record_max_) << ")";
-      DCHECK_EQ(alloc_record_head_, 0U);
-      DCHECK_EQ(alloc_record_count_, 0U);
-      recent_allocation_records_ = new AllocRecord[alloc_record_max_];
-      CHECK(recent_allocation_records_ != nullptr);
-    }
-    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
-  } else {
-    {
-      ScopedObjectAccess soa(self);  // For type_cache_.Clear();
-      MutexLock mu(self, *Locks::alloc_tracker_lock_);
-      if (recent_allocation_records_ == nullptr) {
-        return;  // Already disabled, bail.
-      }
-      LOG(INFO) << "Disabling alloc tracker";
-      delete[] recent_allocation_records_;
-      recent_allocation_records_ = nullptr;
-      alloc_record_head_ = 0;
-      alloc_record_count_ = 0;
-      type_cache_.Clear();
-    }
-    // If an allocation comes in before we uninstrument, we will safely drop it on the floor.
-    Runtime::Current()->GetInstrumentation()->UninstrumentQuickAllocEntryPoints();
-  }
-}
-
-struct AllocRecordStackVisitor : public StackVisitor {
-  AllocRecordStackVisitor(Thread* thread, AllocRecord* record_in)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
-      : StackVisitor(thread, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
-        record(record_in),
-        depth(0) {}
-
-  // TODO: Enable annotalysis. We know lock is held in constructor, but abstraction confuses
-  // annotalysis.
-  bool VisitFrame() NO_THREAD_SAFETY_ANALYSIS {
-    if (depth >= kMaxAllocRecordStackDepth) {
-      return false;
-    }
-    ArtMethod* m = GetMethod();
-    if (!m->IsRuntimeMethod()) {
-      record->StackElement(depth)->SetMethod(m);
-      record->StackElement(depth)->SetDexPc(GetDexPc());
-      ++depth;
-    }
-    return true;
-  }
-
-  ~AllocRecordStackVisitor() {
-    // Clear out any unused stack trace elements.
-    for (; depth < kMaxAllocRecordStackDepth; ++depth) {
-      record->StackElement(depth)->SetMethod(nullptr);
-      record->StackElement(depth)->SetDexPc(0);
-    }
-  }
-
-  AllocRecord* record;
-  size_t depth;
-};
-
-void Dbg::RecordAllocation(Thread* self, mirror::Class* type, size_t byte_count) {
-  MutexLock mu(self, *Locks::alloc_tracker_lock_);
-  if (recent_allocation_records_ == nullptr) {
-    // In the process of shutting down recording, bail.
-    return;
-  }
-
-  // Advance and clip.
-  if (++alloc_record_head_ == alloc_record_max_) {
-    alloc_record_head_ = 0;
-  }
-
-  // Fill in the basics.
-  AllocRecord* record = &recent_allocation_records_[alloc_record_head_];
-  record->SetType(type);
-  record->SetByteCount(byte_count);
-  record->SetThinLockId(self->GetThreadId());
-
-  // Fill in the stack trace.
-  AllocRecordStackVisitor visitor(self, record);
-  visitor.WalkStack();
-
-  if (alloc_record_count_ < alloc_record_max_) {
-    ++alloc_record_count_;
-  }
-}
-
-// Returns the index of the head element.
-//
-// We point at the most-recently-written record, so if alloc_record_count_ is 1
-// we want to use the current element.  Take "head+1" and subtract count
-// from it.
-//
-// We need to handle underflow in our circular buffer, so we add
-// alloc_record_max_ and then mask it back down.
-size_t Dbg::HeadIndex() {
-  return (Dbg::alloc_record_head_ + 1 + Dbg::alloc_record_max_ - Dbg::alloc_record_count_) &
-      (Dbg::alloc_record_max_ - 1);
+  gc::AllocRecordObjectMap::SetAllocTrackingEnabled(enable);
 }
 
 void Dbg::DumpRecentAllocations() {
   ScopedObjectAccess soa(Thread::Current());
   MutexLock mu(soa.Self(), *Locks::alloc_tracker_lock_);
-  if (recent_allocation_records_ == nullptr) {
+  if (!Runtime::Current()->GetHeap()->IsAllocTrackingEnabled()) {
     LOG(INFO) << "Not recording tracked allocations";
     return;
   }
+  gc::AllocRecordObjectMap* records = Runtime::Current()->GetHeap()->GetAllocationRecords();
+  CHECK(records != nullptr);
 
-  // "i" is the head of the list.  We want to start at the end of the
-  // list and move forward to the tail.
-  size_t i = HeadIndex();
-  const uint16_t capped_count = CappedAllocRecordCount(Dbg::alloc_record_count_);
+  const uint16_t capped_count = CappedAllocRecordCount(records->Size());
   uint16_t count = capped_count;
 
-  LOG(INFO) << "Tracked allocations, (head=" << alloc_record_head_ << " count=" << count << ")";
-  while (count--) {
-    AllocRecord* record = &recent_allocation_records_[i];
+  LOG(INFO) << "Tracked allocations, (count=" << count << ")";
+  for (auto it = records->RBegin(), end = records->REnd();
+      count > 0 && it != end; count--, it++) {
+    const gc::AllocRecord* record = it->second;
 
-    LOG(INFO) << StringPrintf(" Thread %-2d %6zd bytes ", record->ThinLockId(), record->ByteCount())
-              << PrettyClass(record->Type());
+    LOG(INFO) << StringPrintf(" Thread %-2d %6zd bytes ", record->GetTid(), record->ByteCount())
+              << PrettyClass(it->first.Read()->GetClass());
 
-    for (size_t stack_frame = 0; stack_frame < kMaxAllocRecordStackDepth; ++stack_frame) {
-      AllocRecordStackTraceElement* stack_element = record->StackElement(stack_frame);
-      ArtMethod* m = stack_element->Method();
-      if (m == nullptr) {
-        break;
-      }
-      LOG(INFO) << "    " << PrettyMethod(m) << " line " << stack_element->LineNumber();
+    for (size_t stack_frame = 0, depth = record->GetDepth(); stack_frame < depth; ++stack_frame) {
+      const gc::AllocRecordStackTraceElement& stack_element = record->StackElement(stack_frame);
+      ArtMethod* m = stack_element.GetMethod();
+      LOG(INFO) << "    " << PrettyMethod(m) << " line " << stack_element.ComputeLineNumber();
     }
 
     // pause periodically to help logcat catch up
     if ((count % 5) == 0) {
       usleep(40000);
     }
-
-    i = (i + 1) & (alloc_record_max_ - 1);
   }
 }
 
@@ -4937,6 +4698,15 @@
   std::vector<uint8_t> bytes;
   {
     MutexLock mu(self, *Locks::alloc_tracker_lock_);
+    gc::AllocRecordObjectMap* records = Runtime::Current()->GetHeap()->GetAllocationRecords();
+    // In case this method is called when the allocation tracker is disabled,
+    // we should still send some data back.
+    gc::AllocRecordObjectMap dummy;
+    if (records == nullptr) {
+      CHECK(!Runtime::Current()->GetHeap()->IsAllocTrackingEnabled());
+      records = &dummy;
+    }
+
     //
     // Part 1: generate string tables.
     //
@@ -4944,26 +4714,23 @@
     StringTable method_names;
     StringTable filenames;
 
-    const uint16_t capped_count = CappedAllocRecordCount(Dbg::alloc_record_count_);
+    const uint16_t capped_count = CappedAllocRecordCount(records->Size());
     uint16_t count = capped_count;
-    size_t idx = HeadIndex();
-    while (count--) {
-      AllocRecord* record = &recent_allocation_records_[idx];
+    for (auto it = records->RBegin(), end = records->REnd();
+         count > 0 && it != end; count--, it++) {
+      const gc::AllocRecord* record = it->second;
       std::string temp;
-      class_names.Add(record->Type()->GetDescriptor(&temp));
-      for (size_t i = 0; i < kMaxAllocRecordStackDepth; i++) {
-        ArtMethod* m = record->StackElement(i)->Method();
-        if (m != nullptr) {
-          class_names.Add(m->GetDeclaringClassDescriptor());
-          method_names.Add(m->GetName());
-          filenames.Add(GetMethodSourceFile(m));
-        }
+      class_names.Add(it->first.Read()->GetClass()->GetDescriptor(&temp));
+      for (size_t i = 0, depth = record->GetDepth(); i < depth; i++) {
+        ArtMethod* m = record->StackElement(i).GetMethod();
+        class_names.Add(m->GetDeclaringClassDescriptor());
+        method_names.Add(m->GetName());
+        filenames.Add(GetMethodSourceFile(m));
       }
-
-      idx = (idx + 1) & (alloc_record_max_ - 1);
     }
 
-    LOG(INFO) << "allocation records: " << capped_count;
+    LOG(INFO) << "recent allocation records: " << capped_count;
+    LOG(INFO) << "allocation records all objects: " << records->Size();
 
     //
     // Part 2: Generate the output and store it in the buffer.
@@ -4991,20 +4758,23 @@
     JDWP::Append2BE(bytes, method_names.Size());
     JDWP::Append2BE(bytes, filenames.Size());
 
-    idx = HeadIndex();
     std::string temp;
-    for (count = capped_count; count != 0; --count) {
+    count = capped_count;
+    // The last "count" number of allocation records in "records" are the most recent "count" number
+    // of allocations. Reverse iterate to get them. The most recent allocation is sent first.
+    for (auto it = records->RBegin(), end = records->REnd();
+         count > 0 && it != end; count--, it++) {
       // For each entry:
       // (4b) total allocation size
       // (2b) thread id
       // (2b) allocated object's class name index
       // (1b) stack depth
-      AllocRecord* record = &recent_allocation_records_[idx];
+      const gc::AllocRecord* record = it->second;
       size_t stack_depth = record->GetDepth();
       size_t allocated_object_class_name_index =
-          class_names.IndexOf(record->Type()->GetDescriptor(&temp));
+          class_names.IndexOf(it->first.Read()->GetClass()->GetDescriptor(&temp));
       JDWP::Append4BE(bytes, record->ByteCount());
-      JDWP::Append2BE(bytes, record->ThinLockId());
+      JDWP::Append2BE(bytes, static_cast<uint16_t>(record->GetTid()));
       JDWP::Append2BE(bytes, allocated_object_class_name_index);
       JDWP::Append1BE(bytes, stack_depth);
 
@@ -5014,16 +4784,15 @@
         // (2b) method name
         // (2b) method source file
         // (2b) line number, clipped to 32767; -2 if native; -1 if no source
-        ArtMethod* m = record->StackElement(stack_frame)->Method();
+        ArtMethod* m = record->StackElement(stack_frame).GetMethod();
         size_t class_name_index = class_names.IndexOf(m->GetDeclaringClassDescriptor());
         size_t method_name_index = method_names.IndexOf(m->GetName());
         size_t file_name_index = filenames.IndexOf(GetMethodSourceFile(m));
         JDWP::Append2BE(bytes, class_name_index);
         JDWP::Append2BE(bytes, method_name_index);
         JDWP::Append2BE(bytes, file_name_index);
-        JDWP::Append2BE(bytes, record->StackElement(stack_frame)->LineNumber());
+        JDWP::Append2BE(bytes, record->StackElement(stack_frame).ComputeLineNumber());
       }
-      idx = (idx + 1) & (alloc_record_max_ - 1);
     }
 
     // (xb) class name strings
diff --git a/runtime/debugger.h b/runtime/debugger.h
index 7c586a4..c23b1b8 100644
--- a/runtime/debugger.h
+++ b/runtime/debugger.h
@@ -23,6 +23,7 @@
 
 #include <pthread.h>
 
+#include <list>
 #include <map>
 #include <set>
 #include <string>
@@ -32,7 +33,6 @@
 #include "jdwp/jdwp.h"
 #include "jni.h"
 #include "jvalue.h"
-#include "object_callbacks.h"
 #include "thread_state.h"
 
 namespace art {
@@ -41,7 +41,6 @@
 class Object;
 class Throwable;
 }  // namespace mirror
-class AllocRecord;
 class ArtField;
 class ArtMethod;
 class ObjectRegistry;
@@ -202,19 +201,6 @@
 
 class Dbg {
  public:
-  class TypeCache {
-   public:
-    // Returns a weak global for the input type. Deduplicates.
-    jobject Add(mirror::Class* t) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_,
-                                                        Locks::alloc_tracker_lock_);
-    // Clears the type cache and deletes all the weak global refs.
-    void Clear() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_,
-                                       Locks::alloc_tracker_lock_);
-
-   private:
-    std::multimap<int32_t, jobject> objects_;
-  };
-
   static void SetJdwpAllowed(bool allowed);
 
   static void StartJdwp();
@@ -655,19 +641,12 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   /*
-   * Recent allocation tracking support.
+   * Allocation tracking support.
    */
-  static void RecordAllocation(Thread* self, mirror::Class* type, size_t byte_count)
-      LOCKS_EXCLUDED(Locks::alloc_tracker_lock_)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
   static void SetAllocTrackingEnabled(bool enabled) LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
-  static bool IsAllocTrackingEnabled() {
-    return recent_allocation_records_ != nullptr;
-  }
   static jbyteArray GetRecentAllocations()
       LOCKS_EXCLUDED(Locks::alloc_tracker_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static size_t HeadIndex() EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
   static void DumpRecentAllocations() LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
 
   enum HpifWhen {
@@ -755,11 +734,6 @@
   static bool IsForcedInterpreterNeededForUpcallImpl(Thread* thread, ArtMethod* m)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  static AllocRecord* recent_allocation_records_ PT_GUARDED_BY(Locks::alloc_tracker_lock_);
-  static size_t alloc_record_max_ GUARDED_BY(Locks::alloc_tracker_lock_);
-  static size_t alloc_record_head_ GUARDED_BY(Locks::alloc_tracker_lock_);
-  static size_t alloc_record_count_ GUARDED_BY(Locks::alloc_tracker_lock_);
-
   // Indicates whether the debugger is making requests.
   static bool gDebuggerActive;
 
@@ -784,9 +758,6 @@
 
   static size_t* GetReferenceCounterForEvent(uint32_t instrumentation_event);
 
-  // Weak global type cache, TODO improve this.
-  static TypeCache type_cache_ GUARDED_BY(Locks::alloc_tracker_lock_);
-
   // Instrumentation event reference counters.
   // TODO we could use an array instead of having all these dedicated counters. Instrumentation
   // events are bits of a mask so we could convert them to array index.
@@ -798,7 +769,6 @@
   static size_t exception_catch_event_ref_count_ GUARDED_BY(Locks::deoptimization_lock_);
   static uint32_t instrumentation_events_ GUARDED_BY(Locks::mutator_lock_);
 
-  friend class AllocRecord;  // For type_cache_ with proper annotalysis.
   DISALLOW_COPY_AND_ASSIGN(Dbg);
 };
 
diff --git a/runtime/gc/allocation_record.cc b/runtime/gc/allocation_record.cc
new file mode 100644
index 0000000..a385363
--- /dev/null
+++ b/runtime/gc/allocation_record.cc
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "allocation_record.h"
+
+#include "art_method-inl.h"
+#include "base/stl_util.h"
+#include "stack.h"
+
+#ifdef HAVE_ANDROID_OS
+#include "cutils/properties.h"
+#endif
+
+namespace art {
+namespace gc {
+
+int32_t AllocRecordStackTraceElement::ComputeLineNumber() const {
+  DCHECK(method_ != nullptr);
+  return method_->GetLineNumFromDexPC(dex_pc_);
+}
+
+void AllocRecordObjectMap::SetProperties() {
+#ifdef HAVE_ANDROID_OS
+  // Check whether there's a system property overriding the max number of records.
+  const char* propertyName = "dalvik.vm.allocTrackerMax";
+  char allocMaxString[PROPERTY_VALUE_MAX];
+  if (property_get(propertyName, allocMaxString, "") > 0) {
+    char* end;
+    size_t value = strtoul(allocMaxString, &end, 10);
+    if (*end != '\0') {
+      LOG(ERROR) << "Ignoring  " << propertyName << " '" << allocMaxString
+                 << "' --- invalid";
+    } else {
+      alloc_record_max_ = value;
+    }
+  }
+  // Check whether there's a system property overriding the max depth of stack trace.
+  propertyName = "dalvik.vm.allocStackDepth";
+  char stackDepthString[PROPERTY_VALUE_MAX];
+  if (property_get(propertyName, stackDepthString, "") > 0) {
+    char* end;
+    size_t value = strtoul(stackDepthString, &end, 10);
+    if (*end != '\0') {
+      LOG(ERROR) << "Ignoring  " << propertyName << " '" << stackDepthString
+                 << "' --- invalid";
+    } else {
+      max_stack_depth_ = value;
+    }
+  }
+#endif
+}
+
+AllocRecordObjectMap::~AllocRecordObjectMap() {
+  STLDeleteValues(&entries_);
+}
+
+void AllocRecordObjectMap::SweepAllocationRecords(IsMarkedCallback* callback, void* arg) {
+  VLOG(heap) << "Start SweepAllocationRecords()";
+  size_t count_deleted = 0, count_moved = 0;
+  for (auto it = entries_.begin(), end = entries_.end(); it != end;) {
+    // This does not need a read barrier because this is called by GC.
+    mirror::Object* old_object = it->first.Read<kWithoutReadBarrier>();
+    AllocRecord* record = it->second;
+    mirror::Object* new_object = callback(old_object, arg);
+    if (new_object == nullptr) {
+      delete record;
+      it = entries_.erase(it);
+      ++count_deleted;
+    } else {
+      if (old_object != new_object) {
+        it->first = GcRoot<mirror::Object>(new_object);
+        ++count_moved;
+      }
+      ++it;
+    }
+  }
+  VLOG(heap) << "Deleted " << count_deleted << " allocation records";
+  VLOG(heap) << "Updated " << count_moved << " allocation records";
+}
+
+struct AllocRecordStackVisitor : public StackVisitor {
+  AllocRecordStackVisitor(Thread* thread, AllocRecordStackTrace* trace_in, size_t max)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      : StackVisitor(thread, nullptr, StackVisitor::StackWalkKind::kIncludeInlinedFrames),
+        trace(trace_in),
+        depth(0),
+        max_depth(max) {}
+
+  // TODO: Enable annotalysis. We know lock is held in constructor, but abstraction confuses
+  // annotalysis.
+  bool VisitFrame() OVERRIDE NO_THREAD_SAFETY_ANALYSIS {
+    if (depth >= max_depth) {
+      return false;
+    }
+    ArtMethod* m = GetMethod();
+    if (!m->IsRuntimeMethod()) {
+      trace->SetStackElementAt(depth, m, GetDexPc());
+      ++depth;
+    }
+    return true;
+  }
+
+  ~AllocRecordStackVisitor() {
+    trace->SetDepth(depth);
+  }
+
+  AllocRecordStackTrace* trace;
+  size_t depth;
+  const size_t max_depth;
+};
+
+void AllocRecordObjectMap::SetAllocTrackingEnabled(bool enable) {
+  Thread* self = Thread::Current();
+  Heap* heap = Runtime::Current()->GetHeap();
+  if (enable) {
+    {
+      MutexLock mu(self, *Locks::alloc_tracker_lock_);
+      if (heap->IsAllocTrackingEnabled()) {
+        return;  // Already enabled, bail.
+      }
+      AllocRecordObjectMap* records = new AllocRecordObjectMap();
+      CHECK(records != nullptr);
+      records->SetProperties();
+      std::string self_name;
+      self->GetThreadName(self_name);
+      if (self_name == "JDWP") {
+        records->alloc_ddm_thread_id_ = self->GetTid();
+      }
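+      // Rough per-record footprint: one AllocRecord, one AllocRecordStackTrace, and
+      // max_stack_depth_ stack trace elements; list node overhead is not counted.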
+      size_t sz = sizeof(AllocRecordStackTraceElement) * records->max_stack_depth_ +
+                  sizeof(AllocRecord) + sizeof(AllocRecordStackTrace);
+      LOG(INFO) << "Enabling alloc tracker (" << records->alloc_record_max_ << " entries of "
+                << records->max_stack_depth_ << " frames, taking up to "
+                << PrettySize(sz * records->alloc_record_max_) << ")";
+      heap->SetAllocationRecords(records);
+      heap->SetAllocTrackingEnabled(true);
+    }
+    Runtime::Current()->GetInstrumentation()->InstrumentQuickAllocEntryPoints();
+  } else {
+    {
+      MutexLock mu(self, *Locks::alloc_tracker_lock_);
+      if (!heap->IsAllocTrackingEnabled()) {
+        return;  // Already disabled, bail.
+      }
+      heap->SetAllocTrackingEnabled(false);
+      LOG(INFO) << "Disabling alloc tracker";
+      heap->SetAllocationRecords(nullptr);
+    }
+    // If an allocation comes in before we uninstrument, we will safely drop it on the floor.
+    Runtime::Current()->GetInstrumentation()->UninstrumentQuickAllocEntryPoints();
+  }
+}
+
+void AllocRecordObjectMap::RecordAllocation(Thread* self, mirror::Object* obj, size_t byte_count) {
+  MutexLock mu(self, *Locks::alloc_tracker_lock_);
+  Heap* heap = Runtime::Current()->GetHeap();
+  if (!heap->IsAllocTrackingEnabled()) {
+    // In the process of shutting down recording, bail.
+    return;
+  }
+
+  AllocRecordObjectMap* records = heap->GetAllocationRecords();
+  DCHECK(records != nullptr);
+
+  // Do not record for DDM thread
+  if (records->alloc_ddm_thread_id_ == self->GetTid()) {
+    return;
+  }
+
+  DCHECK_LE(records->Size(), records->alloc_record_max_);
+
+  // Remove oldest record.
+  if (records->Size() == records->alloc_record_max_) {
+    records->RemoveOldest();
+  }
+
+  // Get stack trace.
+  const size_t max_depth = records->max_stack_depth_;
+  AllocRecordStackTrace* trace = new AllocRecordStackTrace(self->GetTid(), max_depth);
+  // Use a scope so that "visitor" is destroyed promptly; its destructor sets trace->depth_.
+  {
+    AllocRecordStackVisitor visitor(self, trace, max_depth);
+    visitor.WalkStack();
+  }
+
+  // Fill in the basics.
+  AllocRecord* record = new AllocRecord(byte_count, trace);
+
+  records->Put(obj, record);
+  DCHECK_LE(records->Size(), records->alloc_record_max_);
+}
+
+}  // namespace gc
+}  // namespace art
diff --git a/runtime/gc/allocation_record.h b/runtime/gc/allocation_record.h
new file mode 100644
index 0000000..45b3406
--- /dev/null
+++ b/runtime/gc/allocation_record.h
@@ -0,0 +1,271 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_RUNTIME_GC_ALLOCATION_RECORD_H_
+#define ART_RUNTIME_GC_ALLOCATION_RECORD_H_
+
+#include <list>
+
+#include "base/mutex.h"
+#include "object_callbacks.h"
+#include "gc_root.h"
+
+namespace art {
+
+class ArtMethod;
+class Thread;
+
+namespace mirror {
+  class Class;
+  class Object;
+}  // namespace mirror
+
+namespace gc {
+
+class AllocRecordStackTraceElement {
+ public:
+  AllocRecordStackTraceElement() : method_(nullptr), dex_pc_(0) {}
+
+  int32_t ComputeLineNumber() const SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  ArtMethod* GetMethod() const {
+    return method_;
+  }
+
+  void SetMethod(ArtMethod* m) {
+    method_ = m;
+  }
+
+  uint32_t GetDexPc() const {
+    return dex_pc_;
+  }
+
+  void SetDexPc(uint32_t pc) {
+    dex_pc_ = pc;
+  }
+
+  bool operator==(const AllocRecordStackTraceElement& other) const {
+    if (this == &other) return true;
+    return method_ == other.method_ && dex_pc_ == other.dex_pc_;
+  }
+
+ private:
+  ArtMethod* method_;
+  uint32_t dex_pc_;
+};
+
+class AllocRecordStackTrace {
+ public:
+  static constexpr size_t kHashMultiplier = 17;
+
+  AllocRecordStackTrace(pid_t tid, size_t max_depth)
+      : tid_(tid), depth_(0), stack_(new AllocRecordStackTraceElement[max_depth]) {}
+
+  ~AllocRecordStackTrace() {
+    delete[] stack_;
+  }
+
+  pid_t GetTid() const {
+    return tid_;
+  }
+
+  size_t GetDepth() const {
+    return depth_;
+  }
+
+  void SetDepth(size_t depth) {
+    depth_ = depth;
+  }
+
+  const AllocRecordStackTraceElement& GetStackElement(size_t index) const {
+    DCHECK_LT(index, depth_);
+    return stack_[index];
+  }
+
+  void SetStackElementAt(size_t index, ArtMethod* m, uint32_t dex_pc) {
+    stack_[index].SetMethod(m);
+    stack_[index].SetDexPc(dex_pc);
+  }
+
+  bool operator==(const AllocRecordStackTrace& other) const {
+    if (this == &other) return true;
+    if (depth_ != other.depth_) return false;
+    for (size_t i = 0; i < depth_; ++i) {
+      if (!(stack_[i] == other.stack_[i])) return false;
+    }
+    return true;
+  }
+
+ private:
+  const pid_t tid_;
+  size_t depth_;
+  AllocRecordStackTraceElement* const stack_;
+};
+
+struct HashAllocRecordTypes {
+  size_t operator()(const AllocRecordStackTraceElement& r) const {
+    return std::hash<void*>()(reinterpret_cast<void*>(r.GetMethod())) *
+        AllocRecordStackTrace::kHashMultiplier + std::hash<uint32_t>()(r.GetDexPc());
+  }
+
+  size_t operator()(const AllocRecordStackTrace& r) const {
+    size_t depth = r.GetDepth();
+    size_t result = r.GetTid() * AllocRecordStackTrace::kHashMultiplier + depth;
+    for (size_t i = 0; i < depth; ++i) {
+      result = result * AllocRecordStackTrace::kHashMultiplier + (*this)(r.GetStackElement(i));
+    }
+    return result;
+  }
+};
+
+template <typename T> struct HashAllocRecordTypesPtr {
+  size_t operator()(const T* r) const {
+    if (r == nullptr) return 0;
+    return HashAllocRecordTypes()(*r);
+  }
+};
+
+template <typename T> struct EqAllocRecordTypesPtr {
+  bool operator()(const T* r1, const T* r2) const {
+    if (r1 == r2) return true;
+    if (r1 == nullptr || r2 == nullptr) return false;
+    return *r1 == *r2;
+  }
+};
+
+class AllocRecord {
+ public:
+  // All instances of AllocRecord should be managed by an instance of AllocRecordObjectMap.
+  AllocRecord(size_t count, AllocRecordStackTrace* trace)
+      : byte_count_(count), trace_(trace) {}
+
+  ~AllocRecord() {
+    delete trace_;
+  }
+
+  size_t GetDepth() const {
+    return trace_->GetDepth();
+  }
+
+  const AllocRecordStackTrace* GetStackTrace() const {
+    return trace_;
+  }
+
+  size_t ByteCount() const {
+    return byte_count_;
+  }
+
+  pid_t GetTid() const {
+    return trace_->GetTid();
+  }
+
+  const AllocRecordStackTraceElement& StackElement(size_t index) const {
+    return trace_->GetStackElement(index);
+  }
+
+ private:
+  const size_t byte_count_;
+  // TODO: Currently trace_ behaves like a std::unique_ptr,
+  // but in the future with deduplication it could become a std::shared_ptr.
+  const AllocRecordStackTrace* const trace_;
+};
+
+class AllocRecordObjectMap {
+ public:
+  // Since the entries contain weak roots, they need a read barrier. Do not directly access
+  // the mirror::Object pointers in the list; use functions that apply read barriers.
+  // No need for "const AllocRecord*" in the list, because all fields of AllocRecord are const.
+  typedef std::list<std::pair<GcRoot<mirror::Object>, AllocRecord*>> EntryList;
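+  // For example (illustrative), reading an entry's object safely looks like
+  //   mirror::Object* obj = it->first.Read();  // Read() applies the read barrier.
+  // rather than dereferencing the root's raw pointer.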
+
+  // "static" because it is part of double-checked locking. It needs to check a bool first,
+  // in order to make sure the AllocRecordObjectMap object is not null.
+  static void RecordAllocation(Thread* self, mirror::Object* obj, size_t byte_count)
+      LOCKS_EXCLUDED(Locks::alloc_tracker_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
+  static void SetAllocTrackingEnabled(bool enabled) LOCKS_EXCLUDED(Locks::alloc_tracker_lock_);
+
+  AllocRecordObjectMap() EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_)
+      : alloc_record_max_(kDefaultNumAllocRecords),
+        max_stack_depth_(kDefaultAllocStackDepth),
+        alloc_ddm_thread_id_(0) {}
+
+  ~AllocRecordObjectMap();
+
+  void Put(mirror::Object* obj, AllocRecord* record)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    entries_.emplace_back(GcRoot<mirror::Object>(obj), record);
+  }
+
+  size_t Size() const SHARED_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.size();
+  }
+
+  void SweepAllocationRecords(IsMarkedCallback* callback, void* arg)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
+
+  void RemoveOldest()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    DCHECK(!entries_.empty());
+    delete entries_.front().second;
+    entries_.pop_front();
+  }
+
+  // TODO: Is there a better way to hide the type of entries_?
+  EntryList::iterator Begin()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.begin();
+  }
+
+  EntryList::iterator End()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.end();
+  }
+
+  EntryList::reverse_iterator RBegin()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.rbegin();
+  }
+
+  EntryList::reverse_iterator REnd()
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return entries_.rend();
+  }
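+
+  // Note: records are appended at the back, so reverse iteration (RBegin() to REnd())
+  // visits the most recent allocations first, the order used by the DDMS allocation dump.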
+
+ private:
+  static constexpr size_t kDefaultNumAllocRecords = 512 * 1024;
+  static constexpr size_t kDefaultAllocStackDepth = 4;
+  size_t alloc_record_max_ GUARDED_BY(Locks::alloc_tracker_lock_);
+  // The implementation always allocates max_stack_depth_ frames for each stack trace.
+  // As long as the max depth is not very large this wastes little memory, since most
+  // stack traces are deep enough to fill all of the frames.
+  size_t max_stack_depth_ GUARDED_BY(Locks::alloc_tracker_lock_);
+  pid_t alloc_ddm_thread_id_ GUARDED_BY(Locks::alloc_tracker_lock_);
+  EntryList entries_ GUARDED_BY(Locks::alloc_tracker_lock_);
+
+  void SetProperties() EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
+};
+
+}  // namespace gc
+}  // namespace art
+#endif  // ART_RUNTIME_GC_ALLOCATION_RECORD_H_
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 2d54330..ee4568e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -22,6 +22,7 @@
 #include "base/time_utils.h"
 #include "debugger.h"
 #include "gc/accounting/card_table-inl.h"
+#include "gc/allocation_record.h"
 #include "gc/collector/semi_space.h"
 #include "gc/space/bump_pointer_space-inl.h"
 #include "gc/space/dlmalloc_space-inl.h"
@@ -168,11 +169,11 @@
     PushOnAllocationStack(self, &obj);
   }
   if (kInstrumented) {
-    if (Dbg::IsAllocTrackingEnabled()) {
-      Dbg::RecordAllocation(self, klass, bytes_allocated);
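+    // Double-checked locking: this lock-free flag check is the fast path;
+    // RecordAllocation() re-checks the flag under Locks::alloc_tracker_lock_
+    // before touching the record map.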
+    if (IsAllocTrackingEnabled()) {
+      AllocRecordObjectMap::RecordAllocation(self, obj, bytes_allocated);
     }
   } else {
-    DCHECK(!Dbg::IsAllocTrackingEnabled());
+    DCHECK(!IsAllocTrackingEnabled());
   }
   // IsConcurrentGc() isn't known at compile time so we can optimize by not checking it for
   // the BumpPointer or TLAB allocators. This is nice since it allows the entire if statement to be
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index aeab7d8..14fa740 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -209,7 +209,8 @@
       blocking_gc_count_last_window_(0U),
       gc_count_rate_histogram_("gc count rate histogram", 1U, kGcCountRateMaxBucketCount),
       blocking_gc_count_rate_histogram_("blocking gc count rate histogram", 1U,
-                                        kGcCountRateMaxBucketCount) {
+                                        kGcCountRateMaxBucketCount),
+      alloc_tracking_enabled_(false) {
   if (VLOG_IS_ON(heap) || VLOG_IS_ON(startup)) {
     LOG(INFO) << "Heap() entering";
   }
@@ -1043,6 +1044,7 @@
   STLDeleteElements(&garbage_collectors_);
   // If we don't reset then the mark stack complains in its destructor.
   allocation_stack_->Reset();
+  allocation_records_.reset();
   live_stack_->Reset();
   STLDeleteValues(&mod_union_tables_);
   STLDeleteValues(&remembered_sets_);
@@ -3653,5 +3655,18 @@
   }
 }
 
+void Heap::SetAllocationRecords(AllocRecordObjectMap* records) {
+  allocation_records_.reset(records);
+}
+
+void Heap::SweepAllocationRecords(IsMarkedCallback* visitor, void* arg) const {
+  if (IsAllocTrackingEnabled()) {
+    MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
+    if (IsAllocTrackingEnabled()) {
+      GetAllocationRecords()->SweepAllocationRecords(visitor, arg);
+    }
+  }
+}
+
 }  // namespace gc
 }  // namespace art
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 81a9741..09bd370 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -58,6 +58,7 @@
 
 namespace gc {
 
+class AllocRecordObjectMap;
 class ReferenceProcessor;
 class TaskProcessor;
 
@@ -683,6 +684,27 @@
   void DumpGcCountRateHistogram(std::ostream& os) const;
   void DumpBlockingGcCountRateHistogram(std::ostream& os) const;
 
+  // Allocation tracking support
+  // Callers of this function use double-checked locking to ensure safety on allocation_records_
+  bool IsAllocTrackingEnabled() const {
+    return alloc_tracking_enabled_.LoadRelaxed();
+  }
+
+  void SetAllocTrackingEnabled(bool enabled) EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    alloc_tracking_enabled_.StoreRelaxed(enabled);
+  }
+
+  AllocRecordObjectMap* GetAllocationRecords() const
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_) {
+    return allocation_records_.get();
+  }
+
+  void SetAllocationRecords(AllocRecordObjectMap* records)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::alloc_tracker_lock_);
+
+  void SweepAllocationRecords(IsMarkedCallback* visitor, void* arg) const
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
  private:
   class ConcurrentGCTask;
   class CollectorTransitionTask;
@@ -1191,6 +1213,11 @@
   // The histogram of the number of blocking GC invocations per window duration.
   Histogram<uint64_t> blocking_gc_count_rate_histogram_ GUARDED_BY(gc_complete_lock_);
 
+  // Allocation tracking support
+  Atomic<bool> alloc_tracking_enabled_;
+  std::unique_ptr<AllocRecordObjectMap> allocation_records_
+      GUARDED_BY(Locks::alloc_tracker_lock_);
+
   friend class CollectorTransitionTask;
   friend class collector::GarbageCollector;
   friend class collector::MarkCompact;
diff --git a/runtime/hprof/hprof.cc b/runtime/hprof/hprof.cc
index 6e0e56e..85eefb0 100644
--- a/runtime/hprof/hprof.cc
+++ b/runtime/hprof/hprof.cc
@@ -48,6 +48,7 @@
 #include "dex_file-inl.h"
 #include "gc_root.h"
 #include "gc/accounting/heap_bitmap.h"
+#include "gc/allocation_record.h"
 #include "gc/heap.h"
 #include "gc/space/space.h"
 #include "globals.h"
@@ -68,7 +69,6 @@
 static constexpr bool kDirectStream = true;
 
 static constexpr uint32_t kHprofTime = 0;
-static constexpr uint32_t kHprofNullStackTrace = 0;
 static constexpr uint32_t kHprofNullThread = 0;
 
 static constexpr size_t kMaxObjectsPerSegment = 128;
@@ -144,6 +144,10 @@
 
 typedef uint32_t HprofStringId;
 typedef uint32_t HprofClassObjectId;
+typedef uint32_t HprofClassSerialNumber;
+typedef uint32_t HprofStackTraceSerialNumber;
+typedef uint32_t HprofStackFrameId;
+static constexpr HprofStackTraceSerialNumber kHprofNullStackTrace = 0;
 
 class EndianOutput {
  public:
@@ -194,6 +198,10 @@
     AddU4(PointerToLowMemUInt32(value));
   }
 
+  void AddStackTraceSerialNumber(HprofStackTraceSerialNumber value) {
+    AddU4(value);
+  }
+
   // The ID for the synthetic object generated to account for class static overhead.
   void AddClassStaticsId(const mirror::Class* value) {
     AddU4(1 | PointerToLowMemUInt32(value));
@@ -415,13 +423,21 @@
         start_ns_(NanoTime()),
         current_heap_(HPROF_HEAP_DEFAULT),
         objects_in_segment_(0),
-        next_string_id_(0x400000) {
+        next_string_id_(0x400000),
+        next_class_serial_number_(1) {
     LOG(INFO) << "hprof: heap dump \"" << filename_ << "\" starting...";
   }
 
   void Dump()
       EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_)
-      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_) {
+      LOCKS_EXCLUDED(Locks::heap_bitmap_lock_, Locks::alloc_tracker_lock_) {
+    {
+      MutexLock mu(Thread::Current(), *Locks::alloc_tracker_lock_);
+      if (Runtime::Current()->GetHeap()->IsAllocTrackingEnabled()) {
+        PopulateAllocationTrackingTraces();
+      }
+    }
+
     // First pass to measure the size of the dump.
     size_t overall_size;
     size_t max_length;
@@ -480,11 +496,11 @@
     objects_in_segment_ = 0;
 
     if (header_first) {
-      ProcessHeader();
+      ProcessHeader(true);
       ProcessBody();
     } else {
       ProcessBody();
-      ProcessHeader();
+      ProcessHeader(false);
     }
   }
 
@@ -501,21 +517,29 @@
     output_->EndRecord();
   }
 
-  void ProcessHeader() EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  void ProcessHeader(bool string_first) EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Write the header.
     WriteFixedHeader();
     // Write the string and class tables, and any stack traces, to the header.
     // (jhat requires that these appear before any of the data in the body that refers to them.)
-    WriteStringTable();
+    // jhat also requires that the string table appear before the class table and stack traces.
+    // However, WriteStackTraces() can add entries to the string table, so WriteStringTable()
+    // must be called last in the first pass, to compute the correct length of the output.
+    if (string_first) {
+      WriteStringTable();
+    }
     WriteClassTable();
     WriteStackTraces();
+    if (!string_first) {
+      WriteStringTable();
+    }
     output_->EndRecord();
   }
 
   void WriteClassTable() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
-    uint32_t nextSerialNumber = 1;
-
-    for (mirror::Class* c : classes_) {
+    for (const auto& p : classes_) {
+      mirror::Class* c = p.first;
+      HprofClassSerialNumber sn = p.second;
       CHECK(c != nullptr);
       output_->StartNewRecord(HPROF_TAG_LOAD_CLASS, kHprofTime);
       // LOAD CLASS format:
@@ -523,9 +547,9 @@
       // ID: class object ID. We use the address of the class object structure as its ID.
       // U4: stack trace serial number
       // ID: class name string ID
-      __ AddU4(nextSerialNumber++);
+      __ AddU4(sn);
       __ AddObjectId(c);
-      __ AddU4(kHprofNullStackTrace);
+      __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(c));
       __ AddStringId(LookupClassNameId(c));
     }
   }
@@ -567,15 +591,31 @@
 
   HprofClassObjectId LookupClassId(mirror::Class* c) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     if (c != nullptr) {
-      auto result = classes_.insert(c);
-      const mirror::Class* present = *result.first;
-      CHECK_EQ(present, c);
-      // Make sure that we've assigned a string ID for this class' name
-      LookupClassNameId(c);
+      auto it = classes_.find(c);
+      if (it == classes_.end()) {
+        // First time we have seen this class.
+        HprofClassSerialNumber sn = next_class_serial_number_++;
+        classes_.Put(c, sn);
+        // Make sure that we've assigned a string ID for this class' name
+        LookupClassNameId(c);
+      }
     }
     return PointerToLowMemUInt32(c);
   }
 
+  HprofStackTraceSerialNumber LookupStackTraceSerialNumber(const mirror::Object* obj)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+    auto r = allocation_records_.find(obj);
+    if (r == allocation_records_.end()) {
+      return kHprofNullStackTrace;
+    } else {
+      const gc::AllocRecordStackTrace* trace = r->second;
+      auto result = traces_.find(trace);
+      CHECK(result != traces_.end());
+      return result->second;
+    }
+  }
+
   HprofStringId LookupStringId(mirror::String* string) SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     return LookupStringId(string->ToModifiedUtf8());
   }
@@ -622,12 +662,66 @@
     __ AddU4(static_cast<uint32_t>(nowMs & 0xFFFFFFFF));
   }
 
-  void WriteStackTraces() {
+  void WriteStackTraces() SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
     // Write a dummy stack trace record so the analysis tools don't freak out.
     output_->StartNewRecord(HPROF_TAG_STACK_TRACE, kHprofTime);
-    __ AddU4(kHprofNullStackTrace);
+    __ AddStackTraceSerialNumber(kHprofNullStackTrace);
     __ AddU4(kHprofNullThread);
     __ AddU4(0);    // no frames
+
+    // TODO: jhat complains "WARNING: Stack trace not found for serial # -1", but no trace should
+    // have -1 as its serial number (as long as HprofStackTraceSerialNumber doesn't overflow).
+    for (const auto& it : traces_) {
+      const gc::AllocRecordStackTrace* trace = it.first;
+      HprofStackTraceSerialNumber trace_sn = it.second;
+      size_t depth = trace->GetDepth();
+
+      // First write stack frames of the trace
+      for (size_t i = 0; i < depth; ++i) {
+        const gc::AllocRecordStackTraceElement* frame = &trace->GetStackElement(i);
+        ArtMethod* method = frame->GetMethod();
+        CHECK(method != nullptr);
+        output_->StartNewRecord(HPROF_TAG_STACK_FRAME, kHprofTime);
+        // STACK FRAME format:
+        // ID: stack frame ID. Each unique frame is assigned a sequential ID (see
+        //     PopulateAllocationTrackingTraces()).
+        // ID: method name string ID
+        // ID: method signature string ID
+        // ID: source file name string ID
+        // U4: class serial number
+        // U4: >0, line number; 0, no line information available; -1, unknown location
+        auto frame_result = frames_.find(frame);
+        CHECK(frame_result != frames_.end());
+        __ AddU4(frame_result->second);
+        __ AddStringId(LookupStringId(method->GetName()));
+        __ AddStringId(LookupStringId(method->GetSignature().ToString()));
+        const char* source_file = method->GetDeclaringClassSourceFile();
+        if (source_file == nullptr) {
+          source_file = "";
+        }
+        __ AddStringId(LookupStringId(source_file));
+        auto class_result = classes_.find(method->GetDeclaringClass());
+        CHECK(class_result != classes_.end());
+        __ AddU4(class_result->second);
+        __ AddU4(frame->ComputeLineNumber());
+      }
+
+      // Then write the trace itself
+      output_->StartNewRecord(HPROF_TAG_STACK_TRACE, kHprofTime);
+      // STACK TRACE format:
+      // U4: stack trace serial number. Each unique trace is assigned a sequential
+      //     serial number (see PopulateAllocationTrackingTraces()).
+      // U4: thread serial number. We use Thread::GetTid().
+      // U4: number of frames
+      // [ID]*: series of stack frame ID's
+      __ AddStackTraceSerialNumber(trace_sn);
+      __ AddU4(trace->GetTid());
+      __ AddU4(depth);
+      for (size_t i = 0; i < depth; ++i) {
+        const gc::AllocRecordStackTraceElement* frame = &trace->GetStackElement(i);
+        auto frame_result = frames_.find(frame);
+        CHECK(frame_result != frames_.end());
+        __ AddU4(frame_result->second);
+      }
+    }
   }
 
   bool DumpToDdmsBuffered(size_t overall_size ATTRIBUTE_UNUSED, size_t max_length ATTRIBUTE_UNUSED)
@@ -723,6 +817,40 @@
     return true;
   }
 
+  void PopulateAllocationTrackingTraces()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::mutator_lock_, Locks::alloc_tracker_lock_) {
+    gc::AllocRecordObjectMap* records = Runtime::Current()->GetHeap()->GetAllocationRecords();
+    CHECK(records != nullptr);
+    HprofStackTraceSerialNumber next_trace_sn = kHprofNullStackTrace + 1;
+    HprofStackFrameId next_frame_id = 0;
+
+    for (auto it = records->Begin(), end = records->End(); it != end; ++it) {
+      const mirror::Object* obj = it->first.Read();
+      const gc::AllocRecordStackTrace* trace = it->second->GetStackTrace();
+
+      // Copy the pair into a real hash map to speed up lookups.
+      auto records_result = allocation_records_.emplace(obj, trace);
+      // The insertion should always succeed, i.e. no duplicate object pointers in "records"
+      CHECK(records_result.second);
+
+      // Generate serial numbers for traces, and IDs for frames.
+      auto traces_result = traces_.find(trace);
+      if (traces_result == traces_.end()) {
+        traces_.emplace(trace, next_trace_sn++);
+        // Only check frames if the trace is newly discovered.
+        for (size_t i = 0, depth = trace->GetDepth(); i < depth; ++i) {
+          const gc::AllocRecordStackTraceElement* frame = &trace->GetStackElement(i);
+          auto frames_result = frames_.find(frame);
+          if (frames_result == frames_.end()) {
+            frames_.emplace(frame, next_frame_id++);
+          }
+        }
+      }
+    }
+    CHECK_EQ(traces_.size(), next_trace_sn - kHprofNullStackTrace - 1);
+    CHECK_EQ(frames_.size(), next_frame_id);
+  }
+
   // If direct_to_ddms_ is set, "filename_" and "fd" will be ignored.
   // Otherwise, "filename_" must be valid, though if "fd" >= 0 it will
   // only be used for debug messages.
@@ -737,9 +865,18 @@
   HprofHeapId current_heap_;  // Which heap we're currently dumping.
   size_t objects_in_segment_;
 
-  std::set<mirror::Class*> classes_;
   HprofStringId next_string_id_;
   SafeMap<std::string, HprofStringId> strings_;
+  HprofClassSerialNumber next_class_serial_number_;
+  SafeMap<mirror::Class*, HprofClassSerialNumber> classes_;
+
+  std::unordered_map<const gc::AllocRecordStackTrace*, HprofStackTraceSerialNumber,
+                     gc::HashAllocRecordTypesPtr<gc::AllocRecordStackTrace>,
+                     gc::EqAllocRecordTypesPtr<gc::AllocRecordStackTrace>> traces_;
+  std::unordered_map<const gc::AllocRecordStackTraceElement*, HprofStackFrameId,
+                     gc::HashAllocRecordTypesPtr<gc::AllocRecordStackTraceElement>,
+                     gc::EqAllocRecordTypesPtr<gc::AllocRecordStackTraceElement>> frames_;
+  std::unordered_map<const mirror::Object*, const gc::AllocRecordStackTrace*> allocation_records_;
 
   DISALLOW_COPY_AND_ASSIGN(Hprof);
 };
@@ -881,10 +1018,6 @@
   ++objects_in_segment_;
 }
 
-static int StackTraceSerialNumber(const mirror::Object* /*obj*/) {
-  return kHprofNullStackTrace;
-}
-
 void Hprof::DumpHeapObject(mirror::Object* obj) {
   // Ignore classes that are retired.
   if (obj->IsClass() && obj->AsClass()->IsRetired()) {
@@ -966,7 +1099,7 @@
     // StaticField array at the end of this class.
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddClassStaticsId(klass);
-    __ AddU4(StackTraceSerialNumber(klass));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(klass));
     __ AddU4(byteLength);
     __ AddU1(hprof_basic_byte);
     for (int i = 0; i < byteLength; ++i) {
@@ -976,7 +1109,7 @@
 
   __ AddU1(HPROF_CLASS_DUMP);
   __ AddClassId(LookupClassId(klass));
-  __ AddU4(StackTraceSerialNumber(klass));
+  __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(klass));
   __ AddClassId(LookupClassId(klass->GetSuperClass()));
   __ AddObjectId(klass->GetClassLoader());
   __ AddObjectId(nullptr);    // no signer
@@ -1072,7 +1205,7 @@
     __ AddU1(HPROF_OBJECT_ARRAY_DUMP);
 
     __ AddObjectId(obj);
-    __ AddU4(StackTraceSerialNumber(obj));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
     __ AddU4(length);
     __ AddClassId(LookupClassId(klass));
 
@@ -1087,7 +1220,7 @@
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
 
     __ AddObjectId(obj);
-    __ AddU4(StackTraceSerialNumber(obj));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
     __ AddU4(length);
     __ AddU1(t);
 
@@ -1108,7 +1241,7 @@
   // obj is an instance object.
   __ AddU1(HPROF_INSTANCE_DUMP);
   __ AddObjectId(obj);
-  __ AddU4(StackTraceSerialNumber(obj));
+  __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
   __ AddClassId(LookupClassId(klass));
 
   // Reserve some space for the length of the instance data, which we won't
@@ -1170,7 +1303,7 @@
 
     __ AddU1(HPROF_PRIMITIVE_ARRAY_DUMP);
     __ AddObjectId(value);
-    __ AddU4(StackTraceSerialNumber(obj));
+    __ AddStackTraceSerialNumber(LookupStackTraceSerialNumber(obj));
     __ AddU4(s->GetLength());
     __ AddU1(hprof_basic_char);
     __ AddU2List(s->GetValue(), s->GetLength());
diff --git a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
index b96ddc8..9ce4a02 100644
--- a/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
+++ b/runtime/native/org_apache_harmony_dalvik_ddmc_DdmVmInternal.cc
@@ -38,7 +38,7 @@
 }
 
 static jboolean DdmVmInternal_getRecentAllocationStatus(JNIEnv*, jclass) {
-  return Dbg::IsAllocTrackingEnabled();
+  return Runtime::Current()->GetHeap()->IsAllocTrackingEnabled();
 }
 
 /*
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 9d651bf..cb7f5f3 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -402,6 +402,7 @@
   GetInternTable()->SweepInternTableWeaks(visitor, arg);
   GetMonitorList()->SweepMonitorList(visitor, arg);
   GetJavaVM()->SweepJniWeakGlobals(visitor, arg);
+  GetHeap()->SweepAllocationRecords(visitor, arg);
 }
 
 bool Runtime::Create(const RuntimeOptions& options, bool ignore_unrecognized) {
@@ -1471,6 +1472,11 @@
   monitor_list_->DisallowNewMonitors();
   intern_table_->DisallowNewInterns();
   java_vm_->DisallowNewWeakGlobals();
+  // TODO: add a similar call for heap.allocation_records_, otherwise some of the newly allocated
+  // objects that are not marked might be swept from the records, making the records incomplete.
+  // It is safe for now since the only effect is that those objects do not have allocation records.
+  // The number of such objects should be small, and current allocation tracker cannot collect
+  // allocation records for all objects anyway.
 }
 
 void Runtime::AllowNewSystemWeaks() {