Add native memory accounting through custom allocator.

Added a custom allocator that lets you pass in a special tag which
specifices where the allocation came from. This is used when
dumping. The performance overhead is low since each allocation only
does a atomic add/sub for each allocation/free.

The measurements are dumped to traces.txt during SIGQUIT.

Example output:
I/art     (27274): AllocatorTagHeap active=120 max=120 total=168
I/art     (27274): AllocatorTagMonitorList active=1572 max=6240 total=11724
I/art     (27274): AllocatorTagClassTable active=185208 max=185208 total=268608
I/art     (27274): AllocatorTagInternTable active=430368 max=430368 total=436080
I/art     (27274): AllocatorTagMaps active=5616 max=6168 total=34392
I/art     (27274): AllocatorTagLOS active=1024 max=1536 total=2044
I/art     (27274): AllocatorTagSafeMap active=0 max=51936 total=533688
I/art     (27274): AllocatorTagLOSMaps active=144 max=1248 total=5760
I/art     (27274): AllocatorTagReferenceTable active=10944 max=11840 total=19136
I/art     (27274): AllocatorTagHeapBitmap active=32 max=40 total=56
I/art     (27274): AllocatorTagHeapBitmapLOS active=8 max=8 total=8
I/art     (27274): AllocatorTagVerifier active=0 max=18844 total=1073156
I/art     (27274): AllocatorTagModUnionCardSet active=5300 max=5920 total=56020
I/art     (27274): AllocatorTagModUnionReferenceArray active=24864 max=24864 total=24864
I/art     (27274): AllocatorTagJNILibrarires active=320 max=320 total=320
I/art     (27274): AllocatorTagOatFile active=1400 max=1400 total=5852

Change-Id: Ibb470ef2e9c9a24563bb46422d46a55799704d82

(cherry picked from commit 5369c40f75fdcb1be7a7c06db212ce965c83a164)
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 6a91501..cd066fc 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -23,6 +23,7 @@
 #include <memory>
 #include <vector>
 
+#include "base/allocator.h"
 #include "base/histogram-inl.h"
 #include "base/stl_util.h"
 #include "common_throws.h"
@@ -146,7 +147,6 @@
       total_objects_freed_ever_(0),
       num_bytes_allocated_(0),
       native_bytes_allocated_(0),
-      gc_memory_overhead_(0),
       verify_missing_card_marks_(false),
       verify_system_weaks_(false),
       verify_pre_gc_heap_(verify_pre_gc_heap),
@@ -790,14 +790,6 @@
   }
 }
 
-void Heap::RegisterGCAllocation(size_t bytes) {
-  gc_memory_overhead_.FetchAndAddSequentiallyConsistent(bytes);
-}
-
-void Heap::RegisterGCDeAllocation(size_t bytes) {
-  gc_memory_overhead_.FetchAndSubSequentiallyConsistent(bytes);
-}
-
 void Heap::DumpGcPerformanceInfo(std::ostream& os) {
   // Dump cumulative timings.
   os << "Dumping cumulative Gc timings\n";
@@ -839,7 +831,6 @@
   }
   os << "Total mutator paused time: " << PrettyDuration(total_paused_time) << "\n";
   os << "Total time waiting for GC to complete: " << PrettyDuration(total_wait_time_) << "\n";
-  os << "Approximate GC data structures memory overhead: " << gc_memory_overhead_.LoadRelaxed();
   BaseMutex::DumpAll(os);
 }