Merge "Dump dex file sizes in oatdump." into dalvik-dev
diff --git a/build/Android.common.mk b/build/Android.common.mk
index 5d89c40..178af64 100644
--- a/build/Android.common.mk
+++ b/build/Android.common.mk
@@ -181,6 +181,7 @@
 	src/dlmalloc.cc \
 	src/file.cc \
 	src/file_linux.cc \
+	src/gc/barrier.cc \
 	src/gc/card_table.cc \
 	src/gc/heap_bitmap.cc \
 	src/gc/large_object_space.cc \
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index 21f824d..7305271 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -216,18 +216,26 @@
   int rTgt = loadHelper(cUnit, helperOffset);
 #endif
   if (arg0.wide == 0) {
-    loadValueDirectFixed(cUnit, arg0, rARG0);
+    loadValueDirectFixed(cUnit, arg0, arg0.fp ? rFARG0 : rARG0);
     if (arg1.wide == 0) {
+#if defined(TARGET_MIPS)
+      loadValueDirectFixed(cUnit, arg1, arg1.fp ? rFARG2 : rARG1);
+#else
       loadValueDirectFixed(cUnit, arg1, rARG1);
+#endif
     } else {
+#if defined(TARGET_MIPS)
+      loadValueDirectWideFixed(cUnit, arg1, arg1.fp ? rFARG2 : rARG1, arg1.fp ? rFARG3 : rARG2);
+#else
       loadValueDirectWideFixed(cUnit, arg1, rARG1, rARG2);
+#endif
     }
   } else {
-    loadValueDirectWideFixed(cUnit, arg0, rARG0, rARG1);
+    loadValueDirectWideFixed(cUnit, arg0, arg0.fp ? rFARG0 : rARG0, arg0.fp ? rFARG1 : rARG1);
     if (arg1.wide == 0) {
-      loadValueDirectFixed(cUnit, arg1, rARG2);
+      loadValueDirectFixed(cUnit, arg1, arg1.fp ? rFARG2 : rARG2);
     } else {
-      loadValueDirectWideFixed(cUnit, arg1, rARG2, rARG3);
+      loadValueDirectWideFixed(cUnit, arg1, arg1.fp ? rFARG2 : rARG2, arg1.fp ? rFARG3 : rARG3);
     }
   }
   oatClobberCalleeSave(cUnit);
@@ -1400,7 +1408,7 @@
       // InitializeTypeFromCode(idx, method)
       callRuntimeHelperImmReg(cUnit, ENTRYPOINT_OFFSET(pInitializeTypeFromCode), type_idx, rARG1,
                               true);
-      opRegCopy(cUnit, classReg, rARG0); // Align usage with fast path
+      opRegCopy(cUnit, classReg, rRET0); // Align usage with fast path
       // Rejoin code paths
       LIR* hopTarget = newLIR0(cUnit, kPseudoTargetLabel);
       hopBranch->target = (LIR*)hopTarget;
@@ -2363,9 +2371,9 @@
    */
   oatFlushAllRegs(cUnit);   /* Send everything to home location */
   if (rlSrc.wide) {
-    loadValueDirectWideFixed(cUnit, rlSrc, rARG0, rARG1);
+    loadValueDirectWideFixed(cUnit, rlSrc, rlSrc.fp ? rFARG0 : rARG0, rlSrc.fp ? rFARG1 : rARG1);
   } else {
-    loadValueDirectFixed(cUnit, rlSrc, rARG0);
+    loadValueDirectFixed(cUnit, rlSrc, rlSrc.fp ? rFARG0 : rARG0);
   }
   callRuntimeHelperRegLocation(cUnit, funcOffset, rlSrc, false);
   if (rlDest.wide) {
diff --git a/src/compiler/codegen/arm/ArmLIR.h b/src/compiler/codegen/arm/ArmLIR.h
index a40df20..1da567e 100644
--- a/src/compiler/codegen/arm/ArmLIR.h
+++ b/src/compiler/codegen/arm/ArmLIR.h
@@ -251,6 +251,10 @@
 #define rARG1 r1
 #define rARG2 r2
 #define rARG3 r3
+#define rFARG0 r0
+#define rFARG1 r1
+#define rFARG2 r2
+#define rFARG3 r3
 #define rRET0 r0
 #define rRET1 r1
 #define rINVOKE_TGT rLR
diff --git a/src/compiler/codegen/mips/FP/MipsFP.cc b/src/compiler/codegen/mips/FP/MipsFP.cc
index 437f1ed..990c71f 100644
--- a/src/compiler/codegen/mips/FP/MipsFP.cc
+++ b/src/compiler/codegen/mips/FP/MipsFP.cc
@@ -197,11 +197,11 @@
   oatFlushAllRegs(cUnit);
   oatLockCallTemps(cUnit);
   if (wide) {
-    loadValueDirectWideFixed(cUnit, rlSrc1, r_FARG0, r_FARG1);
-    loadValueDirectWideFixed(cUnit, rlSrc2, r_FARG2, r_FARG3);
+    loadValueDirectWideFixed(cUnit, rlSrc1, rFARG0, rFARG1);
+    loadValueDirectWideFixed(cUnit, rlSrc2, rFARG2, rFARG3);
   } else {
-    loadValueDirectFixed(cUnit, rlSrc1, r_FARG0);
-    loadValueDirectFixed(cUnit, rlSrc2, r_FARG2);
+    loadValueDirectFixed(cUnit, rlSrc1, rFARG0);
+    loadValueDirectFixed(cUnit, rlSrc2, rFARG2);
   }
   int rTgt = loadHelper(cUnit, offset);
   // NOTE: not a safepoint
diff --git a/src/compiler/codegen/mips/MipsLIR.h b/src/compiler/codegen/mips/MipsLIR.h
index 5077c9f..4b7da55 100644
--- a/src/compiler/codegen/mips/MipsLIR.h
+++ b/src/compiler/codegen/mips/MipsLIR.h
@@ -301,6 +301,10 @@
 #define rARG1 r_ARG1
 #define rARG2 r_ARG2
 #define rARG3 r_ARG3
+#define rFARG0 r_FARG0
+#define rFARG1 r_FARG1
+#define rFARG2 r_FARG2
+#define rFARG3 r_FARG3
 #define rRET0 r_RESULT0
 #define rRET1 r_RESULT1
 #define rINVOKE_TGT r_T9
diff --git a/src/compiler/codegen/x86/X86LIR.h b/src/compiler/codegen/x86/X86LIR.h
index d3e3da8..8ad014a 100644
--- a/src/compiler/codegen/x86/X86LIR.h
+++ b/src/compiler/codegen/x86/X86LIR.h
@@ -238,6 +238,10 @@
 #define rARG1 rCX
 #define rARG2 rDX
 #define rARG3 rBX
+#define rFARG0 rAX
+#define rFARG1 rCX
+#define rFARG2 rDX
+#define rFARG3 rBX
 #define rRET0 rAX
 #define rRET1 rDX
 #define rINVOKE_TGT rAX
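
Note on the rFARG defines above: on ARM and x86 they simply alias the existing core argument registers (r0-r3, and rAX/rCX/rDX/rBX respectively), so the new "arg.fp ? rFARGn : rARGn" selections in GenCommon.cc are no-ops on those targets. Only on MIPS, where rFARG0-rFARG3 map to the dedicated r_FARG0-r_FARG3 registers, do floating-point arguments actually land in FP argument registers.
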
diff --git a/src/compiler_llvm/method_compiler.cc b/src/compiler_llvm/method_compiler.cc
index f91694e..fea0c72 100644
--- a/src/compiler_llvm/method_compiler.cc
+++ b/src/compiler_llvm/method_compiler.cc
@@ -4344,7 +4344,7 @@
     case Instruction::SGET_CHAR:
     case Instruction::SGET_SHORT:
       {
-        if (insn->Opcode() == Instruction::AGET_OBJECT) {
+        if (insn->Opcode() == Instruction::SGET_OBJECT) {
           set_to_another_object[dec_insn.vA] = true;
         }
         uint32_t field_idx = dec_insn.vB;
diff --git a/src/gc/barrier.cc b/src/gc/barrier.cc
new file mode 100644
index 0000000..aa9433b
--- /dev/null
+++ b/src/gc/barrier.cc
@@ -0,0 +1,46 @@
+#include "barrier.h"
+#include "../mutex.h"
+#include "thread.h"
+
+namespace art {
+
+Barrier::Barrier()
+    : count_(0),
+      lock_("GC barrier lock"),
+      condition_("GC barrier condition", lock_) {
+}
+
+void Barrier::Pass(Thread* self) {
+  MutexLock mu(self, lock_);
+  SetCountLocked(self, count_ - 1);
+}
+
+void Barrier::Wait(Thread* self) {
+  Increment(self, -1);
+}
+
+void Barrier::Init(Thread* self, int count) {
+  MutexLock mu(self, lock_);
+  SetCountLocked(self, count);
+}
+
+void Barrier::Increment(Thread* self, int delta) {
+  MutexLock mu(self, lock_);
+  SetCountLocked(self, count_ + delta);
+  while (count_ != 0) {
+    condition_.Wait(self);
+  }
+}
+
+void Barrier::SetCountLocked(Thread* self, int count) {
+  count_ = count;
+  if (count_ == 0) {
+    condition_.Broadcast(self);
+  }
+}
+
+Barrier::~Barrier() {
+  CHECK(!count_) << "Attempted to destroy barrier with non-zero count";
+}
+
+}  // namespace art
diff --git a/src/gc/barrier.h b/src/gc/barrier.h
new file mode 100644
index 0000000..207536a
--- /dev/null
+++ b/src/gc/barrier.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2012 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ART_SRC_GC_BARRIER_H_
+#define ART_SRC_GC_BARRIER_H_
+
+#include "../mutex.h"
+#include "locks.h"
+#include "UniquePtr.h"
+
+namespace art {
+
+class Barrier {
+ public:
+  Barrier();
+  virtual ~Barrier();
+
+  // Pass through the barrier, decrementing the count; does not block.
+  void Pass(Thread* self);
+
+  // Wait on the barrier, decrementing the count.
+  void Wait(Thread* self);
+
+  // Set the count to a new value; if the new value is 0, everyone waiting on the condition
+  // variable is resumed.
+  void Init(Thread* self, int count);
+
+  // Increment the count by delta; wait on the condition variable if the resulting count is non-zero.
+  void Increment(Thread* self, int delta);
+
+ private:
+  void SetCountLocked(Thread* self, int count) EXCLUSIVE_LOCKS_REQUIRED(lock_);
+
+  // Counter; when it reaches 0, all threads blocked on the barrier are signalled.
+  int count_ GUARDED_BY(lock_);
+
+  Mutex lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
+  ConditionVariable condition_ GUARDED_BY(lock_);
+};
+
+}  // namespace art
+#endif  // ART_SRC_GC_BARRIER_H_
+
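
For readers new to this barrier shape: Pass() decrements without blocking, Increment() applies a delta and then blocks until the count returns to zero, and the count is allowed to dip below zero transiently. Below is a minimal self-contained C++11 analogue of those semantics (a sketch for illustration only, not ART code; BarrierSketch and its members are invented names):

    // Minimal analogue of the Barrier semantics above; illustration only.
    #include <condition_variable>
    #include <mutex>

    class BarrierSketch {
     public:
      // Decrement the count without blocking; wakes waiters when it hits zero.
      void Pass() {
        std::lock_guard<std::mutex> lock(mu_);
        SetCountLocked(count_ - 1);
      }
      // Add delta, then block until the count returns to zero.
      void Increment(int delta) {
        std::unique_lock<std::mutex> lock(mu_);
        SetCountLocked(count_ + delta);
        cv_.wait(lock, [this] { return count_ == 0; });  // predicate guards against spurious wakeups
      }
     private:
      void SetCountLocked(int count) {
        count_ = count;
        if (count_ == 0) {
          cv_.notify_all();
        }
      }
      std::mutex mu_;
      std::condition_variable cv_;
      int count_ = 0;
    };
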
diff --git a/src/gc/card_table.h b/src/gc/card_table.h
index 2715f52..035c073 100644
--- a/src/gc/card_table.h
+++ b/src/gc/card_table.h
@@ -160,4 +160,4 @@
 };
 
 }  // namespace art
-#endif  // DALVIK_ALLOC_CARDTABLE_H_
+#endif  // ART_SRC_GC_CARDTABLE_H_
diff --git a/src/gc/mark_sweep.cc b/src/gc/mark_sweep.cc
index 980eed1..0869e26 100644
--- a/src/gc/mark_sweep.cc
+++ b/src/gc/mark_sweep.cc
@@ -19,6 +19,7 @@
 #include <climits>
 #include <vector>
 
+#include "barrier.h"
 #include "card_table.h"
 #include "class_loader.h"
 #include "dex_cache.h"
@@ -36,11 +37,12 @@
 #include "timing_logger.h"
 #include "thread.h"
 #include "thread_list.h"
-
-static const bool kUseMarkStackPrefetch = true;
+#include "verifier/method_verifier.h"
 
 namespace art {
 
+static const bool kUseMarkStackPrefetch = true;
+
 class SetFingerVisitor {
  public:
   SetFingerVisitor(MarkSweep* const mark_sweep) : mark_sweep_(mark_sweep) {
@@ -68,7 +70,8 @@
       phantom_reference_list_(NULL),
       cleared_reference_list_(NULL),
       freed_bytes_(0), freed_objects_(0),
-      class_count_(0), array_count_(0), other_count_(0) {
+      class_count_(0), array_count_(0), other_count_(0),
+      gc_barrier_(new Barrier) {
   DCHECK(mark_stack_ != NULL);
 }
 
@@ -183,9 +186,10 @@
     LargeObjectSpace* large_object_space = GetHeap()->GetLargeObjectsSpace();
     if (!large_object_space->Contains(root)) {
       LOG(ERROR) << "Found invalid root: " << root;
-      LOG(ERROR) << "VReg / Shadow frame offset: " << vreg;
+      LOG(ERROR) << "VReg: " << vreg;
       if (method != NULL) {
-        LOG(ERROR) << "In method " << PrettyMethod(method, true);
+        LOG(ERROR) << "In method " << PrettyMethod(method, true) << "\nVerifier output:\n";
+        verifier::MethodVerifier::VerifyMethodAndDump(const_cast<AbstractMethod*>(method));
       }
     }
   }
@@ -200,6 +204,10 @@
   Runtime::Current()->VisitNonConcurrentRoots(MarkObjectVisitor, this);
 }
 
+void MarkSweep::MarkNonThreadRoots() {
+  Runtime::Current()->VisitNonThreadRoots(MarkObjectVisitor, this);
+}
+
 void MarkSweep::MarkConcurrentRoots() {
   Runtime::Current()->VisitConcurrentRoots(MarkObjectVisitor, this);
 }
@@ -519,6 +527,38 @@
   Thread* self;
 };
 
+class CheckpointMarkThreadRoots : public Thread::CheckpointFunction {
+ public:
+  explicit CheckpointMarkThreadRoots(MarkSweep* mark_sweep) : mark_sweep_(mark_sweep) {
+  }
+
+  virtual void Run(Thread* thread) NO_THREAD_SAFETY_ANALYSIS {
+    // Note: self is not necessarily equal to thread since thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(thread == self || thread->IsSuspended() || thread->GetState() == kWaitingPerformingGc);
+    WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+    thread->VisitRoots(MarkSweep::MarkObjectVisitor, mark_sweep_);
+    mark_sweep_->GetBarrier().Pass(self);
+  }
+
+ private:
+  MarkSweep* mark_sweep_;
+};
+
+Barrier& MarkSweep::GetBarrier() {
+  return *gc_barrier_;
+}
+
+void MarkSweep::MarkRootsCheckpoint() {
+  UniquePtr<CheckpointMarkThreadRoots> check_point(new CheckpointMarkThreadRoots(this));
+  ThreadList* thread_list = Runtime::Current()->GetThreadList();
+  // Increment the count of the barrier. If all of the checkpoints have already been finished then
+  // the count will hit 0 and we continue. Otherwise we are still waiting for some checkpoints, so
+  // the counter will go positive and we will unblock when it hits zero.
+  gc_barrier_->Increment(Thread::Current(), thread_list->RunCheckpoint(check_point.get()));
+}
+
 void MarkSweep::SweepCallback(size_t num_ptrs, Object** ptrs, void* arg) {
   size_t freed_objects = num_ptrs;
   size_t freed_bytes = 0;
@@ -538,8 +578,7 @@
     freed_bytes += space->FreeList(self, num_ptrs, ptrs);
   } else {
     for (size_t i = 0; i < num_ptrs; ++i) {
-      Object* obj = static_cast<Object*>(ptrs[i]);
-      freed_bytes += space->Free(self, obj);
+      freed_bytes += space->Free(self, ptrs[i]);
     }
   }
 
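
The count bookkeeping in MarkRootsCheckpoint() is subtle: the checkpoints for the GC thread itself and for already-suspended threads run synchronously inside RunCheckpoint() and Pass() the barrier before Increment() is even called, so the count can go negative before the expected total is added. A hypothetical walk-through of the arithmetic (illustration only, with invented numbers: two runnable and two suspended threads):

    #include <cassert>

    int main() {
      int count = 0;       // barrier count, as in Barrier::count_
      count -= 3;          // self + two suspended threads Pass() inside RunCheckpoint()
      count += 5;          // Increment() by RunCheckpoint()'s return value: 2 + 2 + 1 for self
      count -= 2;          // the two runnable threads Pass() at their next suspend check
      assert(count == 0);  // the GC thread unblocks once the count is back to zero
      return 0;
    }
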
diff --git a/src/gc/mark_sweep.h b/src/gc/mark_sweep.h
index ed74f99..d64439f 100644
--- a/src/gc/mark_sweep.h
+++ b/src/gc/mark_sweep.h
@@ -25,6 +25,7 @@
 
 namespace art {
 
+class Barrier;
 class CheckObjectVisitor;
 class Class;
 class Heap;
@@ -52,9 +53,15 @@
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  void MarkNonThreadRoots()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
   void MarkConcurrentRoots();
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
+  void MarkRootsCheckpoint()
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
   // Verify that image roots point to only marked objects within the alloc space.
   void VerifyImageRoots() EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -183,6 +190,11 @@
     }
   }
 
+  static void MarkObjectVisitor(const Object* root, void* arg)
+      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
+
+  Barrier& GetBarrier();
+
  private:
   // Returns true if the object has its bit set in the mark bitmap.
   bool IsMarked(const Object* object) const;
@@ -193,9 +205,6 @@
   static bool IsMarkedArrayCallback(const Object* object, void* arg)
       SHARED_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
-  static void MarkObjectVisitor(const Object* root, void* arg)
-      EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
-
   static void ReMarkObjectVisitor(const Object* root, void* arg)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::heap_bitmap_lock_);
 
@@ -417,6 +426,8 @@
   size_t array_count_;
   size_t other_count_;
 
+  UniquePtr<Barrier> gc_barrier_;
+
   friend class AddIfReachesAllocSpaceVisitor; // Used by mod-union table.
   friend class CheckBitmapVisitor;
   friend class CheckObjectVisitor;
diff --git a/src/heap.cc b/src/heap.cc
index 584c5b2..33f8366 100644
--- a/src/heap.cc
+++ b/src/heap.cc
@@ -1509,16 +1509,12 @@
 void Heap::CollectGarbageConcurrentMarkSweepPlan(Thread* self, GcType gc_type, GcCause gc_cause,
                                                  bool clear_soft_references) {
   TimingLogger timings("ConcurrentCollectGarbageInternal", true);
-  uint64_t root_begin = NanoTime(), root_end = 0, dirty_begin = 0, dirty_end = 0;
+  uint64_t gc_begin = NanoTime(), root_begin = 0, root_end = 0, dirty_begin = 0, dirty_end = 0;
   std::stringstream gc_type_str;
   gc_type_str << gc_type << " ";
 
-  // Suspend all threads are get exclusive access to the heap.
   ThreadList* thread_list = Runtime::Current()->GetThreadList();
-  thread_list->SuspendAll();
-  timings.AddSplit("SuspendAll");
-  Locks::mutator_lock_->AssertExclusiveHeld(self);
-
+  std::vector<byte*> dirty_cards;
   size_t bytes_freed = 0;
   Object* cleared_references = NULL;
   {
@@ -1528,48 +1524,9 @@
     mark_sweep.Init();
     timings.AddSplit("Init");
 
-    if (verify_pre_gc_heap_) {
-      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      if (!VerifyHeapReferences()) {
-        LOG(FATAL) << "Pre " << gc_type_str.str() << "Gc verification failed";
-      }
-      timings.AddSplit("VerifyHeapReferencesPreGC");
-    }
-
-    // Swap the stacks, this is safe since all the mutators are suspended at this point.
-    SwapStacks();
-
-    // Check that all objects which reference things in the live stack are on dirty cards.
-    if (verify_missing_card_marks_) {
-      ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      // Sort the live stack so that we can quickly binary search it later.
-      std::sort(live_stack_->Begin(), live_stack_->End());
-      if (!VerifyMissingCardMarks()) {
-        LOG(FATAL) << "Pre GC verification of missing card marks failed";
-      }
-    }
-
-    // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
-    // TODO: Investigate using a mark stack instead of a vector.
-    std::vector<byte*> dirty_cards;
-    if (gc_type == kGcTypeSticky) {
-      dirty_cards.reserve(4 * KB);
-      for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
-        card_table_->GetDirtyCards(*it, dirty_cards);
-      }
-      timings.AddSplit("GetDirtyCards");
-    }
-
-    // Clear image space cards and keep track of cards we cleared in the mod-union table.
-    ClearCards(timings);
-
     {
       WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
 
-      for (Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-        DCHECK(!GetLiveBitmap()->Test(*it));
-      }
-
       if (gc_type == kGcTypePartial) {
         // Copy the mark bits over from the live bits, do this as early as possible or else we can
         // accidentally un-mark roots.
@@ -1595,25 +1552,65 @@
                                   reinterpret_cast<Object*>(alloc_space_->Begin()));
       }
       mark_sweep.FindDefaultMarkBitmap();
+    }
 
-      // Marking roots is not necessary for sticky mark bits since we only actually require the
-      // remarking of roots.
-      if (gc_type != kGcTypeSticky) {
-        mark_sweep.MarkRoots();
-        timings.AddSplit("MarkRoots");
+    mark_sweep.MarkRootsCheckpoint();
+    timings.AddSplit("MarkRootsCheckpoint");
+
+    {
+      root_begin = NanoTime();
+
+      // Suspend all threads and get exclusive access to the heap.
+      thread_list->SuspendAll();
+      timings.AddSplit("SuspendAll");
+      Locks::mutator_lock_->AssertExclusiveHeld(self);
+
+      if (verify_pre_gc_heap_) {
+        WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
+        if (!VerifyHeapReferences()) {
+          LOG(FATAL) << "Pre " << gc_type_str.str() << "Gc verification failed";
+        }
+        timings.AddSplit("VerifyHeapReferencesPreGC");
       }
 
-      if (verify_mod_union_table_) {
-        zygote_mod_union_table_->Update();
-        zygote_mod_union_table_->Verify();
-        mod_union_table_->Update();
-        mod_union_table_->Verify();
+      // Swap the stacks, this is safe since all the mutators are suspended at this point.
+      SwapStacks();
+
+      // Check that all objects which reference things in the live stack are on dirty cards.
+      if (verify_missing_card_marks_) {
+        ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
+        // Sort the live stack so that we can quickly binary search it later.
+        std::sort(live_stack_->Begin(), live_stack_->End());
+        if (!VerifyMissingCardMarks()) {
+          LOG(FATAL) << "Pre GC verification of missing card marks failed";
+        }
       }
+
+      // We will need to know which cards were dirty for doing concurrent processing of dirty cards.
+      // TODO: Investigate using a mark stack instead of a vector.
+      if (gc_type == kGcTypeSticky) {
+        dirty_cards.reserve(4 * KB);
+        for (Spaces::iterator it = spaces_.begin(); it != spaces_.end(); ++it) {
+          card_table_->GetDirtyCards(*it, dirty_cards);
+        }
+        timings.AddSplit("GetDirtyCards");
+      }
+
+      // Clear image space cards and keep track of cards we cleared in the mod-union table.
+      ClearCards(timings);
     }
 
     // Roots are marked on the bitmap and the mark_stack is empty.
     DCHECK(mark_stack_->IsEmpty());
 
+    if (verify_mod_union_table_) {
+      ReaderMutexLock reader_lock(self, *Locks::heap_bitmap_lock_);
+      zygote_mod_union_table_->Update();
+      zygote_mod_union_table_->Verify();
+      mod_union_table_->Update();
+      mod_union_table_->Verify();
+    }
+
     // Allow mutators to go again, acquire share on mutator_lock_ to continue.
     thread_list->ResumeAll();
     {
@@ -1622,11 +1619,11 @@
       timings.AddSplit("RootEnd");
 
       // Mark the roots which we can do concurrently.
+      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
       mark_sweep.MarkConcurrentRoots();
       timings.AddSplit("MarkConcurrentRoots");
-
-      WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
+      mark_sweep.MarkNonThreadRoots();
+      timings.AddSplit("MarkNonThreadRoots");
 
       if (gc_type != kGcTypeSticky) {
         // Mark everything allocated since the last as GC live so that we can sweep concurrently,
@@ -1636,6 +1633,8 @@
         timings.AddSplit("MarkStackAsLive");
       }
 
+      UpdateAndMarkModUnion(&mark_sweep, timings, gc_type);
+
       if (gc_type != kGcTypeSticky) {
         // Recursively mark all the non-image bits set in the mark bitmap.
         mark_sweep.RecursiveMark(gc_type == kGcTypePartial, timings);
@@ -1775,7 +1774,7 @@
   // If the GC was slow, then print timings in the log.
   uint64_t pause_roots = (root_end - root_begin) / 1000 * 1000;
   uint64_t pause_dirty = (dirty_end - dirty_begin) / 1000 * 1000;
-  uint64_t duration = (NanoTime() - root_begin) / 1000 * 1000;
+  uint64_t duration = (NanoTime() - gc_begin) / 1000 * 1000;
   total_paused_time_ += pause_roots + pause_dirty;
   if (pause_roots > MsToNs(5) || pause_dirty > MsToNs(5) ||
       (gc_cause == kGcCauseForAlloc && duration > MsToNs(20))) {
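
Note: the restructuring above shrinks the stop-the-world window for the concurrent collector. Thread stacks are now scanned via MarkRootsCheckpoint() before any suspension, with each runnable thread marking its own roots at its next suspend check, and only the stack swap, dirty-card collection, and optional verification still run under SuspendAll(). Timing now starts at gc_begin so the reported duration also covers the pre-pause checkpoint work.
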
diff --git a/src/oat/runtime/mips/runtime_support_mips.S b/src/oat/runtime/mips/runtime_support_mips.S
index eafd380..349e802 100644
--- a/src/oat/runtime/mips/runtime_support_mips.S
+++ b/src/oat/runtime/mips/runtime_support_mips.S
@@ -353,11 +353,11 @@
     sw    $t0, 16($sp)                    # pass $sp
     addiu $sp, $sp, 16                    # release out args
     move  $a0, $v0                        # save target Method*
-    move  $t0, $v1                        # save $v0->code_
+    move  $t9, $v1                        # save $v0->code_
     RESTORE_REF_AND_ARGS_CALLEE_SAVE_FRAME
     beqz  $v0, 1f
     nop
-    jr    $t0
+    jr    $t9
     nop
 1:
     DELIVER_PENDING_EXCEPTION
@@ -389,13 +389,13 @@
     move     $a0, rSELF       # pass Thread::Current
     jal      artWorkAroundAppJniBugs  # (Thread*, $sp)
     move     $a1, $sp         # pass $sp
-    move     $t0, $v0         # save target address
+    move     $t9, $v0         # save target address
     lw       $a0, 12($sp)
     lw       $a1, 16($sp)
     lw       $a2, 20($sp)
     lw       $a3, 24($sp)
     lw       $ra, 28($sp)
-    jr       $t0              # tail call into JNI routine
+    jr       $t9              # tail call into JNI routine
     addiu    $sp, $sp, 32
 
     .global art_handle_fill_data_from_code
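
Note: the switch from $t0 to $t9 for the call target matters on MIPS: under the o32 PIC calling convention the callee's prologue recomputes $gp on the assumption that $t9 ($25) holds the address of the called function, so indirect calls must be made through $t9.
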
diff --git a/src/oat/runtime/support_proxy.cc b/src/oat/runtime/support_proxy.cc
index 3d0c1c8..7ece784 100644
--- a/src/oat/runtime/support_proxy.cc
+++ b/src/oat/runtime/support_proxy.cc
@@ -30,8 +30,8 @@
 #define ARG2_OFFSET_IN_WORDS 11 // offset to 3rd arg; skip callee saves, LR, Method* and out arg spills for OUT0 to OUT2
 #elif defined(__mips__)
 #define SP_OFFSET_IN_BYTES 8
-#define FRAME_SIZE_IN_BYTES 32u
-#define ARG2_OFFSET_IN_WORDS 8 // offset to 3rd arg; skip callee saves, LR, Method* and out arg spills for OUT0 to OUT2
+#define FRAME_SIZE_IN_BYTES 48u
+#define ARG2_OFFSET_IN_WORDS 12 // offset to 3rd arg; skip callee saves, LR, Method* and out arg spills for OUT0 to OUT2
 #elif defined(__i386__)
 #define SP_OFFSET_IN_BYTES 8
 #define FRAME_SIZE_IN_BYTES 32u
diff --git a/src/oat/runtime/support_thread.cc b/src/oat/runtime/support_thread.cc
index 2eef424..c33b42e 100644
--- a/src/oat/runtime/support_thread.cc
+++ b/src/oat/runtime/support_thread.cc
@@ -20,18 +20,32 @@
 
 namespace art {
 
+static void CheckSuspend(Thread* thread)
+    SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
+  for (;;) {
+    if (thread->ReadFlag(kCheckpointRequest)) {
+      thread->RunCheckpointFunction();
+      thread->AtomicClearFlag(kCheckpointRequest);
+    } else if (thread->ReadFlag(kSuspendRequest)) {
+      thread->FullSuspendCheck();
+    } else {
+      break;
+    }
+  }
+}
+
 void CheckSuspendFromCode(Thread* thread)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Called when thread->suspend_count_ != 0 on JNI return. JNI method acts as callee-save frame.
   thread->VerifyStack();
-  thread->FullSuspendCheck();
+  CheckSuspend(thread);
 }
 
 extern "C" void artTestSuspendFromCode(Thread* thread, AbstractMethod** sp)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   // Called when suspend count check value is 0 and thread->suspend_count_ != 0
   FinishCalleeSaveFrameSetup(thread, sp, Runtime::kRefsOnly);
-  thread->FullSuspendCheck();
+  CheckSuspend(thread);
 }
 
 }  // namespace art
diff --git a/src/runtime.cc b/src/runtime.cc
index 3a5c41c..7bc1b70 100644
--- a/src/runtime.cc
+++ b/src/runtime.cc
@@ -1002,10 +1002,9 @@
   }
 }
 
-void Runtime::VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg) {
+void Runtime::VisitNonThreadRoots(Heap::RootVisitor* visitor, void* arg) {
   Dbg::VisitRoots(visitor, arg);
   java_vm_->VisitRoots(visitor, arg);
-  thread_list_->VisitRoots(visitor, arg);
   if (pre_allocated_OutOfMemoryError_ != NULL) {
     visitor(pre_allocated_OutOfMemoryError_, arg);
   }
@@ -1020,6 +1019,11 @@
   }
 }
 
+void Runtime::VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg) {
+  thread_list_->VisitRoots(visitor, arg);
+  VisitNonThreadRoots(visitor, arg);
+}
+
 void Runtime::DirtyRoots() {
   intern_table_->Dirty();
   class_linker_->Dirty();
diff --git a/src/runtime.h b/src/runtime.h
index 44823a0..02f5206 100644
--- a/src/runtime.h
+++ b/src/runtime.h
@@ -234,6 +234,9 @@
   // Visit all of the roots we can do safely do concurrently.
   void VisitConcurrentRoots(Heap::RootVisitor* visitor, void* arg);
 
+  // Visit all of the non-thread roots; we can do this with mutators unpaused.
+  void VisitNonThreadRoots(Heap::RootVisitor* visitor, void* arg);
+
   // Visit all other roots which must be done with mutators suspended.
   void VisitNonConcurrentRoots(Heap::RootVisitor* visitor, void* arg)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
diff --git a/src/scoped_thread_state_change.h b/src/scoped_thread_state_change.h
index 32c3685..39f5c3f 100644
--- a/src/scoped_thread_state_change.h
+++ b/src/scoped_thread_state_change.h
@@ -49,6 +49,7 @@
         // A suspended transition to another effectively suspended transition, ok to use Unsafe.
         self_->SetState(new_thread_state);
       }
+
       if (runnable_transition && old_thread_state_ != new_thread_state) {
         if (new_thread_state == kRunnable) {
           self_->TransitionFromSuspendedToRunnable();
diff --git a/src/signal_catcher.cc b/src/signal_catcher.cc
index 80c37d4..b54c819 100644
--- a/src/signal_catcher.cc
+++ b/src/signal_catcher.cc
@@ -145,8 +145,11 @@
     }
   }
   os << "----- end " << getpid() << " -----\n";
-
   CHECK_EQ(self->SetStateUnsafe(old_state), kRunnable);
+  if (self->ReadFlag(kCheckpointRequest)) {
+    self->RunCheckpointFunction();
+    self->AtomicClearFlag(kCheckpointRequest);
+  }
   self->EndAssertNoThreadSuspension(old_cause);
   thread_list->ResumeAll();
 
@@ -186,7 +189,7 @@
   CHECK(runtime->AttachCurrentThread("Signal Catcher", true, runtime->GetSystemThreadGroup()));
 
   Thread* self = Thread::Current();
-
+  DCHECK_NE(self->GetState(), kRunnable);
   {
     MutexLock mu(self, signal_catcher->lock_);
     signal_catcher->thread_ = self;
diff --git a/src/thread.cc b/src/thread.cc
index cea919f..67773a5 100644
--- a/src/thread.cc
+++ b/src/thread.cc
@@ -555,6 +555,19 @@
   }
 }
 
+bool Thread::RequestCheckpoint(CheckpointFunction* function) {
+  CHECK(!ReadFlag(kCheckpointRequest)) << "Already have a pending checkpoint request";
+  checkpoint_function_ = function;
+  union StateAndFlags old_state_and_flags = state_and_flags_;
+  // We must be runnable to request a checkpoint.
+  old_state_and_flags.as_struct.state = kRunnable;
+  union StateAndFlags new_state_and_flags = old_state_and_flags;
+  new_state_and_flags.as_struct.flags |= kCheckpointRequest;
+  int succeeded = android_atomic_cmpxchg(old_state_and_flags.as_int, new_state_and_flags.as_int,
+                                         &state_and_flags_.as_int);
+  return succeeded == 0;
+}
+
 void Thread::FullSuspendCheck() {
   VLOG(threads) << this << " self-suspending";
   // Make thread appear suspended to other threads, release mutator_lock_.
@@ -570,7 +583,20 @@
   DCHECK_EQ(this, Thread::Current());
   // Change to non-runnable state, thereby appearing suspended to the system.
   DCHECK_EQ(GetState(), kRunnable);
-  state_and_flags_.as_struct.state = new_state;
+  union StateAndFlags old_state_and_flags;
+  union StateAndFlags new_state_and_flags;
+  do {
+    old_state_and_flags = state_and_flags_;
+    // Copy over flags and try to clear the checkpoint bit if it is set.
+    new_state_and_flags.as_struct.flags = old_state_and_flags.as_struct.flags & ~kCheckpointRequest;
+    new_state_and_flags.as_struct.state = new_state;
+  } while (android_atomic_cmpxchg(old_state_and_flags.as_int, new_state_and_flags.as_int,
+                                  &state_and_flags_.as_int) != 0);
+  // If we toggled the checkpoint flag we must have cleared it.
+  uint16_t flag_change = new_state_and_flags.as_struct.flags ^ old_state_and_flags.as_struct.flags;
+  if ((flag_change & kCheckpointRequest) != 0) {
+    RunCheckpointFunction();
+  }
   // Release share on mutator_lock_.
   Locks::mutator_lock_->SharedUnlock(this);
 }
@@ -935,6 +961,7 @@
       pthread_self_(0),
       no_thread_suspension_(0),
       last_no_thread_suspension_cause_(NULL),
+      checkpoint_function_(0),
       thread_exit_check_count_(0) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
   state_and_flags_.as_struct.flags = 0;
diff --git a/src/thread.h b/src/thread.h
index d559449..abfd719 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -101,10 +101,17 @@
   kSuspendRequest   = 1,  // If set implies that suspend_count_ > 0.
   kExceptionPending = 2,  // If set implies that exception_ != NULL.
   kEnterInterpreter = 4,  // Instruct managed code it should enter the interpreter.
+  kCheckpointRequest = 8, // Request that the thread do some set of work.
 };
 
 class PACKED Thread {
  public:
+  class CheckpointFunction {
+   public:
+    virtual ~CheckpointFunction() { }
+    virtual void Run(Thread* self) = 0;
+  };
+
   // Space to throw a StackOverflowError in.
 #if !defined(ART_USE_LLVM_COMPILER)
   static const size_t kStackOverflowReservedBytes = 4 * KB;
@@ -167,13 +174,17 @@
     return debug_suspend_count_;
   }
 
-  bool IsSuspended() const EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_) {
-    return GetState() != kRunnable && ReadFlag(kSuspendRequest);
+  bool IsSuspended() const {
+    union StateAndFlags state_and_flags = state_and_flags_;
+    return state_and_flags.as_struct.state != kRunnable &&
+        (state_and_flags.as_struct.flags & kSuspendRequest) != 0;
   }
 
   void ModifySuspendCount(Thread* self, int delta, bool for_debugger)
       EXCLUSIVE_LOCKS_REQUIRED(Locks::thread_suspend_count_lock_);
 
+  bool RequestCheckpoint(CheckpointFunction* function);
+
   // Called when thread detected that the thread_suspend_count_ was non-zero. Gives up share of
   // mutator_lock_ and waits until it is resumed and thread_suspend_count_ is zero.
   void FullSuspendCheck()
@@ -570,6 +581,19 @@
     held_mutexes_[level] = mutex;
   }
 
+  void RunCheckpointFunction() {
+    CHECK(checkpoint_function_ != NULL);
+    checkpoint_function_->Run(this);
+  }
+
+  bool ReadFlag(ThreadFlag flag) const {
+    return (state_and_flags_.as_struct.flags & flag) != 0;
+  }
+
+  void AtomicSetFlag(ThreadFlag flag);
+
+  void AtomicClearFlag(ThreadFlag flag);
+
  private:
   // We have no control over the size of 'bool', but want our boolean fields
   // to be 4-byte quantities.
@@ -617,14 +641,6 @@
 
   void NotifyLocked(Thread* self) EXCLUSIVE_LOCKS_REQUIRED(wait_mutex_);
 
-  bool ReadFlag(ThreadFlag flag) const {
-    return (state_and_flags_.as_struct.flags & flag) != 0;
-  }
-
-  void AtomicSetFlag(ThreadFlag flag);
-
-  void AtomicClearFlag(ThreadFlag flag);
-
   static void ThreadExitCallback(void* arg);
 
   // TLS key used to retrieve the Thread*.
@@ -759,6 +775,9 @@
   // Cause for last suspension.
   const char* last_no_thread_suspension_cause_;
 
+  // Pending checkpoint functions.
+  CheckpointFunction* checkpoint_function_;
+
  public:
   // Runtime support function pointers
   // TODO: move this near the top, since changing its offset requires all oats to be recompiled!
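
RequestCheckpoint() works because the thread state and flags share a single 32-bit word, so one compare-and-swap can simultaneously assert "the thread is still runnable" and set kCheckpointRequest; if the target thread leaves the runnable state first, the CAS fails and ThreadList::RunCheckpoint falls back to the suspend-count path. A rough self-contained sketch of that pattern with std::atomic (illustration only; the real StateAndFlags is a packed union, and the bit positions here are assumptions):

    #include <atomic>
    #include <cstdint>

    enum : uint32_t { kRunnableState = 0, kCheckpointFlag = 8 };

    // Assumed layout: flags in the low 16 bits, state in the high 16 bits.
    static std::atomic<uint32_t> state_and_flags{0};

    // Returns true only if the flag was set while the state was still runnable.
    bool RequestCheckpointSketch() {
      uint32_t observed = state_and_flags.load(std::memory_order_relaxed);
      // Require the state half to be runnable; keep whatever flags were observed.
      uint32_t expected = (kRunnableState << 16) | (observed & 0xFFFFu);
      uint32_t desired = expected | kCheckpointFlag;
      // Fails if the state or flags changed in the meantime.
      return state_and_flags.compare_exchange_strong(expected, desired);
    }
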
diff --git a/src/thread_list.cc b/src/thread_list.cc
index 601a93f..4ad25ae 100644
--- a/src/thread_list.cc
+++ b/src/thread_list.cc
@@ -151,6 +151,79 @@
 }
 #endif
 
+size_t ThreadList::RunCheckpoint(Thread::CheckpointFunction* checkpoint_function) {
+  Thread* self = Thread::Current();
+  if (kIsDebugBuild) {
+    Locks::mutator_lock_->AssertNotHeld(self);
+    Locks::thread_list_lock_->AssertNotHeld(self);
+    Locks::thread_suspend_count_lock_->AssertNotHeld(self);
+    CHECK_NE(self->GetState(), kRunnable);
+  }
+
+  std::vector<Thread*> suspended_count_modified_threads;
+  size_t count = 0;
+  {
+    // Call a checkpoint function for each thread; threads which are suspended get their checkpoint
+    // manually called.
+    MutexLock mu(self, *Locks::thread_list_lock_);
+    // TODO: C++0x auto.
+    for (It it = list_.begin(), end = list_.end(); it != end; ++it) {
+      Thread* thread = *it;
+      if (thread != self) {
+        for (;;) {
+          if (thread->RequestCheckpoint(checkpoint_function)) {
+            // This thread will run its checkpoint some time in the near future.
+            count++;
+            break;
+          } else {
+            // We are probably suspended; try to make sure that we stay suspended.
+            MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+            // The thread switched back to runnable; retry the checkpoint request.
+            if (thread->GetState() == kRunnable) {
+              continue;
+            }
+            thread->ModifySuspendCount(self, +1, false);
+            suspended_count_modified_threads.push_back(thread);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Run the checkpoint on ourself while we wait for threads to suspend.
+  checkpoint_function->Run(self);
+
+  // Run the checkpoint on the suspended threads.
+  for (size_t i = 0; i < suspended_count_modified_threads.size(); ++i) {
+    Thread* thread = suspended_count_modified_threads[i];
+    if (!thread->IsSuspended()) {
+      // Wait until the thread is suspended.
+      uint64_t start = NanoTime();
+      do {
+        // Sleep for 100us.
+        usleep(100);
+      } while (!thread->IsSuspended());
+      uint64_t end = NanoTime();
+      // Shouldn't need to wait for longer than 1 millisecond.
+      const uint64_t threshold = 1;
+      if (NsToMs(end - start) > threshold) {
+        LOG(INFO) << "Warning: waited longer than " << threshold << " ms for thrad suspend"
+                  << std::endl;
+      }
+    }
+    // We know for sure that the thread is suspended at this point.
+    thread->RunCheckpointFunction();
+    {
+      MutexLock mu2(self, *Locks::thread_suspend_count_lock_);
+      thread->ModifySuspendCount(self, -1, false);
+    }
+  }
+
+  // Add one for self.
+  return count + suspended_count_modified_threads.size() + 1;
+}
+
 void ThreadList::SuspendAll() {
   Thread* self = Thread::Current();
 
diff --git a/src/thread_list.h b/src/thread_list.h
index 3142fd3..a41fa57 100644
--- a/src/thread_list.h
+++ b/src/thread_list.h
@@ -55,6 +55,12 @@
       LOCKS_EXCLUDED(Locks::thread_list_lock_,
                      Locks::thread_suspend_count_lock_);
 
+  // Run a checkpoint on all threads. Running threads are not suspended but run the checkpoint
+  // inside of their next suspend check. Returns how many checkpoints we should expect to run.
+  size_t RunCheckpoint(Thread::CheckpointFunction* checkpoint_function)
+      LOCKS_EXCLUDED(Locks::thread_list_lock_,
+                     Locks::thread_suspend_count_lock_);
+
   // Suspends all threads
   void SuspendAllForDebugger()
       LOCKS_EXCLUDED(Locks::mutator_lock_,
diff --git a/src/verifier/method_verifier.h b/src/verifier/method_verifier.h
index 8951b13..6765c10 100644
--- a/src/verifier/method_verifier.h
+++ b/src/verifier/method_verifier.h
@@ -162,6 +162,9 @@
                                  std::string& error)
       SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
+  static void VerifyMethodAndDump(AbstractMethod* method)
+      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
+
   uint8_t EncodePcToReferenceMapData() const;
 
   uint32_t DexFileVersion() const {
@@ -235,8 +238,6 @@
       ClassLoader* class_loader, uint32_t class_def_idx, const DexFile::CodeItem* code_item,
       AbstractMethod* method, uint32_t method_access_flags)
           SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
-  static void VerifyMethodAndDump(AbstractMethod* method)
-      SHARED_LOCKS_REQUIRED(Locks::mutator_lock_);
 
   // Run verification on the method. Returns true if verification completes and false if the input
   // has an irrecoverable corruption.
@@ -653,8 +654,6 @@
   // The number of occurrences of specific opcodes.
   size_t new_instance_count_;
   size_t monitor_enter_count_;
-
-  friend struct art::ReferenceMap2Visitor; // for VerifyMethodAndDump
 };
 std::ostream& operator<<(std::ostream& os, const MethodVerifier::FailureKind& rhs);