Enable moving classes.

Slight reduction in Zygote size; the memory savings are in the noise.
Before: Zygote size: 8739224
After: Zygote size: 8733568

Fixed a bug where we didn't set the concurrent start bytes after
switching the allocator from bump pointer to ROSAlloc in the
zygote. This caused excessive memory usage.
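
A minimal sketch of the fix (the signature, call site, and exact expression
are assumptions, not the verbatim change; the field names come from heap.cc):
after the zygote swaps its allocator, the concurrent GC trigger has to be
recomputed, otherwise it keeps the stale bump pointer value and concurrent
collections start far too late.

  // Sketch only, assuming a helper like this exists.
  void Heap::ChangeAllocator(AllocatorType allocator) {
    current_allocator_ = allocator;  // e.g. kAllocatorTypeRosAlloc once the zygote space is created
    // Previously missing: recompute the threshold at which a concurrent GC is
    // requested so it tracks the new allocator instead of the old bump
    // pointer footprint (assumes max_allowed_footprint_ exceeds the constant).
    concurrent_start_bytes_ = max_allowed_footprint_ - kMinConcurrentRemainingBytes;
  }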

Added the method verifiers as roots to fix an issue caused by
RegTypes holding a Class*.
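
A hedged sketch of what "method verifiers as roots" means (member and method
names here are assumptions, not the verbatim change): each live
MethodVerifier caches RegTypes that hold raw Class* values, so once classes
can move those pointers must be reported to the GC like any other root.

  // Sketch only; assumes the runtime keeps a lock-protected list of the
  // currently running method verifiers.
  void Runtime::VisitNonThreadRoots(RootVisitor* visitor, void* arg) {
    // ... existing roots ...
    MutexLock mu(Thread::Current(), method_verifiers_lock_);
    for (verifier::MethodVerifier* verifier : method_verifiers_) {
      verifier->VisitRoots(visitor, arg);  // covers the Class* held by its RegTypes
    }
  }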

Added logic to clear the card table in the SemiSpace collector; this
reduces DalvikOther from ~2400k to ~1760k when using the SemiSpace
collector.
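
The mechanism in isolation (a generic sketch, not the ART classes; the real
changes are in the card_table.cc and semi_space.cc hunks below): because
kCardClean is 0, the table can be "cleared" by dropping its pages instead of
writing zeros, so a clean card table costs almost no resident memory.

  #include <stdint.h>
  #include <sys/mman.h>

  // Assumes begin/size are page-aligned, as the card table's MemMap is.
  static void ClearCards(uint8_t* begin, size_t size) {
    // MADV_DONTNEED on an anonymous private mapping zero-fills the pages on
    // their next touch, which is exactly the clean-card value.
    madvise(begin, size, MADV_DONTNEED);
  }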

Added a missing lock to the timing loggers; the missing lock caused a
rare one-time crash in std::set.
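
A generic illustration of the race (not the actual logger classes): std::set
gives no thread-safety guarantees for concurrent mutation, so the insert
path needs an external lock.

  #include <mutex>
  #include <set>
  #include <string>

  class SplitTimings {
   public:
    void AddSplit(const std::string& label) {
      std::lock_guard<std::mutex> guard(lock_);  // the previously missing lock
      splits_.insert(label);
    }
   private:
    std::mutex lock_;
    std::set<std::string> splits_;
  };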

Bug: 11771255
Bug: 8499494
Bug: 10802951

Change-Id: I99d2b528cd51c1c5ed7012e3220b3aefded680ae
diff --git a/runtime/gc/accounting/card_table.cc b/runtime/gc/accounting/card_table.cc
index 7818bc8..e099137 100644
--- a/runtime/gc/accounting/card_table.cc
+++ b/runtime/gc/accounting/card_table.cc
@@ -95,8 +95,8 @@
 }
 
 void CardTable::ClearCardTable() {
-  // TODO: clear just the range of the table that has been modified
-  memset(mem_map_->Begin(), kCardClean, mem_map_->Size());
+  COMPILE_ASSERT(kCardClean == 0, clean_card_must_be_0);
+  madvise(mem_map_->Begin(), mem_map_->Size(), MADV_DONTNEED);
 }
 
 bool CardTable::AddrIsInCardTable(const void* addr) const {
diff --git a/runtime/gc/accounting/mod_union_table.cc b/runtime/gc/accounting/mod_union_table.cc
index faa198a..b428e74 100644
--- a/runtime/gc/accounting/mod_union_table.cc
+++ b/runtime/gc/accounting/mod_union_table.cc
@@ -82,7 +82,7 @@
     if (ref != nullptr) {
       Object* new_ref = visitor_(ref, arg_);
       if (new_ref != ref) {
-        obj->SetFieldObject(offset, new_ref, true);
+        obj->SetFieldPtr(offset, new_ref, true);
       }
     }
   }
diff --git a/runtime/gc/collector/mark_sweep-inl.h b/runtime/gc/collector/mark_sweep-inl.h
index 7a51553..9c1c5dc 100644
--- a/runtime/gc/collector/mark_sweep-inl.h
+++ b/runtime/gc/collector/mark_sweep-inl.h
@@ -69,15 +69,14 @@
   DCHECK(obj->GetClass() != NULL);
   mirror::Class* klass = obj->GetClass();
   DCHECK(klass != NULL);
-  if (visit_class) {
-    visitor(obj, klass, mirror::Object::ClassOffset(), false);
-  }
   if (klass == mirror::Class::GetJavaLangClass()) {
     DCHECK_EQ(klass->GetClass(), mirror::Class::GetJavaLangClass());
     VisitClassReferences(klass, obj, visitor);
   } else {
     if (klass->IsArrayClass()) {
-      visitor(obj, klass, mirror::Object::ClassOffset(), false);
+      if (visit_class) {
+        visitor(obj, klass, mirror::Object::ClassOffset(), false);
+      }
       if (klass->IsObjectArrayClass()) {
         VisitObjectArrayReferences(obj->AsObjectArray<mirror::Object>(), visitor);
       }
diff --git a/runtime/gc/collector/mark_sweep.h b/runtime/gc/collector/mark_sweep.h
index 53d85b0..62991bb 100644
--- a/runtime/gc/collector/mark_sweep.h
+++ b/runtime/gc/collector/mark_sweep.h
@@ -89,10 +89,12 @@
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void MarkNonThreadRoots()
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
-  void MarkConcurrentRoots();
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+  void MarkConcurrentRoots()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void MarkRootsCheckpoint(Thread* self)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index 3939354..63e0cfa 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -173,6 +173,10 @@
   BindBitmaps();
   // Process dirty cards and add dirty cards to mod-union tables.
   heap_->ProcessCards(timings_);
+  // Clear the whole card table since we cannot get any additional dirty cards during the
+  // paused GC. This saves memory but only works for pause-the-world collectors.
+  timings_.NewSplit("ClearCardTable");
+  heap_->GetCardTable()->ClearCardTable();
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
@@ -318,8 +322,6 @@
         memcpy(reinterpret_cast<void*>(forward_address), obj, object_size);
         // Make sure to only update the forwarding address AFTER you copy the object so that the
         // monitor word doesn't get stomped over.
-        COMPILE_ASSERT(sizeof(uint32_t) == sizeof(mirror::Object*),
-                       monitor_size_must_be_same_as_object);
         obj->SetLockWord(LockWord::FromForwardingAddress(reinterpret_cast<size_t>(forward_address)));
         MarkStackPush(forward_address);
       }
@@ -508,7 +510,10 @@
     mirror::Object* new_address = MarkObject(ref);
     if (new_address != ref) {
       DCHECK(new_address != nullptr);
-      obj->SetFieldObject(offset, new_address, false);
+      // No need to mark the card since we are updating the object address, not changing the
+      // actual object it points to. Using SetFieldPtr is better in this case since it does not
+      // dirty cards or use additional memory.
+      obj->SetFieldPtr(offset, new_address, false);
     }
   }, kMovingClasses);
   mirror::Class* klass = obj->GetClass();
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 08ab6b8..99f084a 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -41,24 +41,20 @@
   // done in the runnable state where suspension is expected.
   DCHECK_EQ(self->GetState(), kRunnable);
   self->AssertThreadSuspensionIsAllowable();
+  // Need to check that we aren't using the large object allocator since the large object
+  // allocation code path calls this function. If we didn't check, we would have an infinite loop.
+  if (allocator != kAllocatorTypeLOS && UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
+    return AllocLargeObject<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
+                                                            pre_fence_visitor);
+  }
   mirror::Object* obj;
   size_t bytes_allocated;
   AllocationTimer alloc_timer(this, &obj);
-  if (UNLIKELY(ShouldAllocLargeObject(klass, byte_count))) {
-    obj = TryToAllocate<kInstrumented>(self, kAllocatorTypeLOS, byte_count, false,
-                                       &bytes_allocated);
-    allocator = kAllocatorTypeLOS;
-  } else {
-    obj = TryToAllocate<kInstrumented>(self, allocator, byte_count, false, &bytes_allocated);
-  }
-
+  obj = TryToAllocate<kInstrumented, false>(self, allocator, byte_count, &bytes_allocated);
   if (UNLIKELY(obj == nullptr)) {
-    SirtRef<mirror::Class> sirt_c(self, klass);
-    obj = AllocateInternalWithGc(self, allocator, byte_count, &bytes_allocated);
+    obj = AllocateInternalWithGc(self, allocator, byte_count, &bytes_allocated, &klass);
     if (obj == nullptr) {
       return nullptr;
-    } else {
-      klass = sirt_c.get();
     }
   }
   obj->SetClass(klass);
@@ -105,11 +101,19 @@
   return obj;
 }
 
-template <const bool kInstrumented>
+template <bool kInstrumented, typename PreFenceVisitor>
+inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
+                                              size_t byte_count,
+                                              const PreFenceVisitor& pre_fence_visitor) {
+  return AllocObjectWithAllocator<kInstrumented, PreFenceVisitor>(self, klass, byte_count,
+                                                                  kAllocatorTypeLOS,
+                                                                  pre_fence_visitor);
+}
+
+template <const bool kInstrumented, const bool kGrow>
 inline mirror::Object* Heap::TryToAllocate(Thread* self, AllocatorType allocator_type,
-                                           size_t alloc_size, bool grow,
-                                           size_t* bytes_allocated) {
-  if (UNLIKELY(IsOutOfMemoryOnAllocation(alloc_size, grow))) {
+                                           size_t alloc_size, size_t* bytes_allocated) {
+  if (UNLIKELY(IsOutOfMemoryOnAllocation<kGrow>(alloc_size))) {
     return nullptr;
   }
   if (kInstrumented) {
@@ -190,14 +194,15 @@
   return byte_count >= kLargeObjectThreshold && have_zygote_space_ && c->IsPrimitiveArray();
 }
 
-inline bool Heap::IsOutOfMemoryOnAllocation(size_t alloc_size, bool grow) {
+template <const bool kGrow>
+inline bool Heap::IsOutOfMemoryOnAllocation(size_t alloc_size) {
   size_t new_footprint = num_bytes_allocated_ + alloc_size;
   if (UNLIKELY(new_footprint > max_allowed_footprint_)) {
     if (UNLIKELY(new_footprint > growth_limit_)) {
       return true;
     }
     if (!concurrent_gc_) {
-      if (!grow) {
+      if (!kGrow) {
         return true;
       }
       // TODO: Grow for allocation is racy, fix it.
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index f92a821..11acd33 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -881,14 +881,17 @@
 }
 
 mirror::Object* Heap::AllocateInternalWithGc(Thread* self, AllocatorType allocator,
-                                             size_t alloc_size, size_t* bytes_allocated) {
+                                             size_t alloc_size, size_t* bytes_allocated,
+                                             mirror::Class** klass) {
   mirror::Object* ptr = nullptr;
+  DCHECK(klass != nullptr);
+  SirtRef<mirror::Class> sirt_klass(self, *klass);
   // The allocation failed. If the GC is running, block until it completes, and then retry the
   // allocation.
   collector::GcType last_gc = WaitForGcToComplete(self);
   if (last_gc != collector::kGcTypeNone) {
     // A GC was in progress and we blocked, retry allocation now that memory has been freed.
-    ptr = TryToAllocate<true>(self, allocator, alloc_size, false, bytes_allocated);
+    ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated);
   }
 
   // Loop through our different Gc types and try to Gc until we get enough free memory.
@@ -899,13 +902,13 @@
     // Attempt to run the collector, if we succeed, re-try the allocation.
     if (CollectGarbageInternal(gc_type, kGcCauseForAlloc, false) != collector::kGcTypeNone) {
       // Did we free sufficient memory for the allocation to succeed?
-      ptr = TryToAllocate<true>(self, allocator, alloc_size, false, bytes_allocated);
+      ptr = TryToAllocate<true, false>(self, allocator, alloc_size, bytes_allocated);
     }
   }
   // Allocations have failed after GCs;  this is an exceptional state.
   if (ptr == nullptr) {
     // Try harder, growing the heap if necessary.
-    ptr = TryToAllocate<true>(self, allocator, alloc_size, true, bytes_allocated);
+    ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated);
   }
   if (ptr == nullptr) {
     // Most allocations should have succeeded by now, so the heap is really full, really fragmented,
@@ -918,11 +921,12 @@
     // We don't need a WaitForGcToComplete here either.
     DCHECK(!gc_plan_.empty());
     CollectGarbageInternal(gc_plan_.back(), kGcCauseForAlloc, true);
-    ptr = TryToAllocate<true>(self, allocator, alloc_size, true, bytes_allocated);
+    ptr = TryToAllocate<true, true>(self, allocator, alloc_size, bytes_allocated);
     if (ptr == nullptr) {
       ThrowOutOfMemoryError(self, alloc_size, false);
     }
   }
+  *klass = sirt_klass.get();
   return ptr;
 }
 
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 3bff3f9..9788064 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -126,11 +126,6 @@
   kProcessStateJankImperceptible = 1,
 };
 
-// If true, measure the total allocation time.
-static constexpr bool kMeasureAllocationTime = false;
-// Primitive arrays larger than this size are put in the large object space.
-static constexpr size_t kLargeObjectThreshold = 3 * kPageSize;
-
 class Heap {
  public:
   // If true, measure the total allocation time.
@@ -522,10 +517,16 @@
   ALWAYS_INLINE void CheckConcurrentGC(Thread* self, size_t new_num_bytes_allocated,
                                        mirror::Object* obj);
 
+  // We don't force this to be inline since it is a slow path.
+  template <bool kInstrumented, typename PreFenceVisitor>
+  mirror::Object* AllocLargeObject(Thread* self, mirror::Class* klass, size_t byte_count,
+                                   const PreFenceVisitor& pre_fence_visitor)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   // Handles Allocate()'s slow allocation path with GC involved after
   // an initial allocation attempt failed.
   mirror::Object* AllocateInternalWithGc(Thread* self, AllocatorType allocator, size_t num_bytes,
-                                         size_t* bytes_allocated)
+                                         size_t* bytes_allocated, mirror::Class** klass)
       LOCKS_EXCLUDED(Locks::thread_suspend_count_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
@@ -536,15 +537,15 @@
 
   // Try to allocate a number of bytes, this function never does any GCs. Needs to be inlined so
   // that the switch statement is constant optimized in the entrypoints.
-  template <const bool kInstrumented>
+  template <const bool kInstrumented, const bool kGrow>
   ALWAYS_INLINE mirror::Object* TryToAllocate(Thread* self, AllocatorType allocator_type,
-                                              size_t alloc_size, bool grow,
-                                              size_t* bytes_allocated)
+                                              size_t alloc_size, size_t* bytes_allocated)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   void ThrowOutOfMemoryError(Thread* self, size_t byte_count, bool large_object_allocation)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  bool IsOutOfMemoryOnAllocation(size_t alloc_size, bool grow);
+  template <const bool kGrow>
+  bool IsOutOfMemoryOnAllocation(size_t alloc_size);
 
   // Pushes a list of cleared references out to the managed heap.
   void SetReferenceReferent(mirror::Object* reference, mirror::Object* referent)