Merge "Fixed result storage of instruction with long constant operand"
diff --git a/Android.mk b/Android.mk
index 62d40bb..6ad5606 100644
--- a/Android.mk
+++ b/Android.mk
@@ -143,7 +143,7 @@
 	@echo test-art-host-interpreter PASSED
 
 .PHONY: test-art-host-dependencies
-test-art-host-dependencies: $(ART_HOST_TEST_DEPENDENCIES) $(HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) $(HOST_CORE_DEX_LOCATIONS)
+test-art-host-dependencies: $(ART_HOST_TEST_DEPENDENCIES) $(HOST_OUT_SHARED_LIBRARIES)/libarttest$(ART_HOST_SHLIB_EXTENSION) $(HOST_CORE_DEX_LOCATIONS) $(HOST_OUT_EXECUTABLES)/jasmin
 
 .PHONY: test-art-host-gtest
 test-art-host-gtest: $(ART_HOST_TEST_TARGETS)
@@ -209,7 +209,7 @@
 	@echo test-art-target PASSED
 
 .PHONY: test-art-target-dependencies
-test-art-target-dependencies: $(ART_TARGET_TEST_DEPENDENCIES) $(ART_TEST_OUT)/libarttest.so
+test-art-target-dependencies: $(ART_TARGET_TEST_DEPENDENCIES) $(ART_TEST_OUT)/libarttest.so $(HOST_OUT_EXECUTABLES)/jasmin
 
 .PHONY: test-art-target-sync
 test-art-target-sync: test-art-target-dependencies
diff --git a/compiler/dex/quick/x86/int_x86.cc b/compiler/dex/quick/x86/int_x86.cc
index 051c001..f4ae18f 100644
--- a/compiler/dex/quick/x86/int_x86.cc
+++ b/compiler/dex/quick/x86/int_x86.cc
@@ -456,10 +456,20 @@
   RegLocation rl_result = {kLocPhysReg, 0, 0, 0, 0, 0, 0, 0, 1, kVectorNotUsed,
                           r2, INVALID_REG, INVALID_SREG, INVALID_SREG};
 
-  // handle 0x80000000 / -1 special case.
-  LIR *minint_branch = 0;
-  if (imm == -1) {
+  // handle div/rem by 1 special case.
+  if (imm == 1) {
     if (is_div) {
+      // x / 1 == x.
+      StoreValue(rl_result, rl_src);
+    } else {
+      // x % 1 == 0.
+      LoadConstantNoClobber(r0, 0);
+      // For this case, return the result in EAX.
+      rl_result.low_reg = r0;
+    }
+  } else if (imm == -1) {  // handle 0x80000000 / -1 special case.
+    if (is_div) {
+      LIR *minint_branch = 0;
       LoadValueDirectFixed(rl_src, r0);
       OpRegImm(kOpCmp, r0, 0x80000000);
       minint_branch = NewLIR2(kX86Jcc8, 0, kX86CondEq);
@@ -479,7 +489,7 @@
     // For this case, return the result in EAX.
     rl_result.low_reg = r0;
   } else {
-    DCHECK(imm <= -2 || imm >= 2);
+    CHECK(imm <= -2 || imm >= 2);
     // Use H.S.Warren's Hacker's Delight Chapter 10 and
     // T,Grablund, P.L.Montogomery's Division by invariant integers using multiplication.
     int magic, shift;
diff --git a/compiler/driver/compiler_driver.cc b/compiler/driver/compiler_driver.cc
index 25d5a42..5b22817 100644
--- a/compiler/driver/compiler_driver.cc
+++ b/compiler/driver/compiler_driver.cc
@@ -980,12 +980,12 @@
 }
 
 static mirror::ArtField* ComputeFieldReferencedFromCompilingMethod(
-    ScopedObjectAccess& soa, const DexCompilationUnit* mUnit, uint32_t field_idx)
+    ScopedObjectAccess& soa, const DexCompilationUnit* mUnit, uint32_t field_idx, bool is_static)
     SHARED_LOCKS_REQUIRED(Locks::mutator_lock_) {
   SirtRef<mirror::DexCache> dex_cache(soa.Self(), mUnit->GetClassLinker()->FindDexCache(*mUnit->GetDexFile()));
   SirtRef<mirror::ClassLoader> class_loader(soa.Self(), soa.Decode<mirror::ClassLoader*>(mUnit->GetClassLoader()));
   return mUnit->GetClassLinker()->ResolveField(*mUnit->GetDexFile(), field_idx, dex_cache,
-                                               class_loader, false);
+                                               class_loader, is_static);
 }
 
 static mirror::ArtMethod* ComputeMethodReferencedFromCompilingMethod(ScopedObjectAccess& soa,
@@ -1030,7 +1030,8 @@
   *field_offset = -1;
   *is_volatile = true;
   // Try to resolve field and ignore if an Incompatible Class Change Error (ie is static).
-  mirror::ArtField* resolved_field = ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx);
+  mirror::ArtField* resolved_field =
+      ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx, false);
   if (resolved_field != NULL && !resolved_field->IsStatic()) {
     SirtRef<mirror::DexCache> dex_cache(soa.Self(),
                                         resolved_field->GetDeclaringClass()->GetDexCache());
@@ -1070,7 +1071,8 @@
   *is_volatile = true;
   *is_initialized = false;
   // Try to resolve field and ignore if an Incompatible Class Change Error (ie isn't static).
-  mirror::ArtField* resolved_field = ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx);
+  mirror::ArtField* resolved_field =
+      ComputeFieldReferencedFromCompilingMethod(soa, mUnit, field_idx, true);
   if (resolved_field != NULL && resolved_field->IsStatic()) {
     SirtRef<mirror::DexCache> dex_cache(soa.Self(), resolved_field->GetDeclaringClass()->GetDexCache());
     mirror::Class* referrer_class =
diff --git a/runtime/gc/accounting/atomic_stack.h b/runtime/gc/accounting/atomic_stack.h
index ea8f89c..d6f3228 100644
--- a/runtime/gc/accounting/atomic_stack.h
+++ b/runtime/gc/accounting/atomic_stack.h
@@ -73,6 +73,43 @@
     return true;
   }
 
+  // Atomically bump the back index by the given number of
+  // slots. Returns false if the bump would overflow the stack.
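+  // On success, returns the reserved slot range via start_address (inclusive) and end_address (exclusive).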
+  bool AtomicBumpBack(size_t num_slots, T** start_address, T** end_address) {
+    if (kIsDebugBuild) {
+      debug_is_sorted_ = false;
+    }
+    int32_t index;
+    int32_t new_index;
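+    // Reserve the slots with a CAS loop; retry if another thread bumps back_index_ first.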
+    do {
+      index = back_index_;
+      new_index = index + num_slots;
+      if (UNLIKELY(static_cast<size_t>(new_index) >= capacity_)) {
+        // Stack overflow.
+        return false;
+      }
+    } while (!back_index_.CompareAndSwap(index, new_index));
+    *start_address = &begin_[index];
+    *end_address = &begin_[new_index];
+    if (kIsDebugBuild) {
+      // Sanity check that the memory is zero.
+      for (int32_t i = index; i < new_index; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i << " index=" << index << " new_index=" << new_index;
+      }
+    }
+    return true;
+  }
+
+  void AssertAllZero() {
+    if (kIsDebugBuild) {
+      for (size_t i = 0; i < capacity_; ++i) {
+        DCHECK_EQ(begin_[i], static_cast<T>(0)) << "i=" << i;
+      }
+    }
+  }
+
   void PushBack(const T& value) {
     if (kIsDebugBuild) {
       debug_is_sorted_ = false;
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 65d4c44..d02b851 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -1560,7 +1560,9 @@
 void RosAlloc::RevokeAllThreadLocalRuns() {
   // This is called when a mutator thread won't allocate such as at
   // the Zygote creation time or during the GC pause.
-  MutexLock mu(Thread::Current(), *Locks::thread_list_lock_);
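+  // Hold the runtime shutdown lock so the runtime and its thread list cannot go away while we iterate.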
+  MutexLock mu(Thread::Current(), *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(Thread::Current(), *Locks::thread_list_lock_);
   std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
   for (auto it = thread_list.begin(); it != thread_list.end(); ++it) {
     Thread* t = *it;
diff --git a/runtime/gc/collector/mark_sweep.cc b/runtime/gc/collector/mark_sweep.cc
index de9f59e..dbbc115 100644
--- a/runtime/gc/collector/mark_sweep.cc
+++ b/runtime/gc/collector/mark_sweep.cc
@@ -206,6 +206,10 @@
     // This second sweep makes sure that we don't have any objects in the live stack which point to
     // freed objects. These cause problems since their references may be previously freed objects.
     SweepArray(GetHeap()->allocation_stack_.get(), false);
+    // Since SweepArray() above resets the (active) allocation stack,
+    // we need to revoke the thread-local allocation stacks that
+    // point into it.
+    GetHeap()->RevokeAllThreadLocalAllocationStacks(self);
   }
 
   timings_.StartSplit("PreSweepingGcVerification");
@@ -241,12 +245,15 @@
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  heap_->SwapStacks(self);
 
   WriterMutexLock mu(self, *Locks::heap_bitmap_lock_);
   if (Locks::mutator_lock_->IsExclusiveHeld(self)) {
     // If we exclusively hold the mutator lock, all threads must be suspended.
     MarkRoots();
+    if (kUseThreadLocalAllocationStack) {
+      heap_->RevokeAllThreadLocalAllocationStacks(self);
+    }
   } else {
     MarkThreadRoots(self);
     // At this point the live stack should no longer have any mutators which push into it.
@@ -995,6 +1002,9 @@
         << thread->GetState() << " thread " << thread << " self " << self;
     thread->VisitRoots(MarkSweep::MarkRootParallelCallback, mark_sweep_);
     ATRACE_END();
+    if (kUseThreadLocalAllocationStack) {
+      thread->RevokeThreadLocalAllocationStack();
+    }
     mark_sweep_->GetBarrier().Pass(self);
   }
 
@@ -1062,6 +1072,10 @@
     Object** out = objects;
     for (size_t i = 0; i < count; ++i) {
       Object* obj = objects[i];
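+      // Thread-local allocation stack segments may contain unused null slots; skip them.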
+      if (kUseThreadLocalAllocationStack && obj == nullptr) {
+        continue;
+      }
       if (space->HasAddress(obj)) {
         // This object is in the space, remove it from the array and add it to the sweep buffer
         // if needed.
@@ -1100,6 +1113,9 @@
   for (size_t i = 0; i < count; ++i) {
     Object* obj = objects[i];
     // Handle large objects.
+    if (kUseThreadLocalAllocationStack && obj == nullptr) {
+      continue;
+    }
     if (!large_mark_objects->Test(obj)) {
       ++freed_large_objects;
       freed_large_object_bytes += large_object_space->Free(self, obj);
diff --git a/runtime/gc/collector/semi_space.cc b/runtime/gc/collector/semi_space.cc
index b37b9d2..b1122b9 100644
--- a/runtime/gc/collector/semi_space.cc
+++ b/runtime/gc/collector/semi_space.cc
@@ -150,6 +150,7 @@
   immune_begin_ = nullptr;
   immune_end_ = nullptr;
   is_large_object_space_immune_ = false;
+  saved_bytes_ = 0;
   self_ = Thread::Current();
   // Do any pre GC verification.
   timings_.NewSplit("PreGcVerification");
@@ -209,7 +210,10 @@
   // Need to do this before the checkpoint since we don't want any threads to add references to
   // the live stack during the recursive mark.
   timings_.NewSplit("SwapStacks");
-  heap_->SwapStacks();
+  if (kUseThreadLocalAllocationStack) {
+    heap_->RevokeAllThreadLocalAllocationStacks(self_);
+  }
+  heap_->SwapStacks(self_);
   WriterMutexLock mu(self_, *Locks::heap_bitmap_lock_);
   MarkRoots();
   // Mark roots of immune spaces.
@@ -361,6 +365,9 @@
   } else {
     mprotect(from_space_->Begin(), from_space_->Capacity(), PROT_READ);
   }
+  if (saved_bytes_ > 0) {
+    VLOG(heap) << "Avoided dirtying " << PrettySize(saved_bytes_);
+  }
 
   if (generational_) {
     // Record the end (top) of the to space so we can distinguish
@@ -400,6 +407,59 @@
   return false;
 }
 
+static inline size_t CopyAvoidingDirtyingPages(void* dest, const void* src, size_t size) {
+  if (LIKELY(size <= static_cast<size_t>(kPageSize))) {
+    // We will dirty the current page and somewhere in the middle of the next page. This means
+    // that the next object copied will also dirty that page.
+    // TODO: Worth considering the last object copied? We may end up unnecessarily dirtying one
+    // page per GC.
+    memcpy(dest, src, size);
+    return 0;
+  }
+  size_t saved_bytes = 0;
+  byte* byte_dest = reinterpret_cast<byte*>(dest);
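+  // Verify the precondition that the destination is zero-filled; the zero-page skipping below relies on it.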
+  if (kIsDebugBuild) {
+    for (size_t i = 0; i < size; ++i) {
+      CHECK_EQ(byte_dest[i], 0U);
+    }
+  }
+  // Process the start of the page. The page must already be dirty, don't bother with checking.
+  const byte* byte_src = reinterpret_cast<const byte*>(src);
+  const byte* limit = byte_src + size;
+  size_t page_remain = AlignUp(byte_dest, kPageSize) - byte_dest;
+  // Copy the bytes until the start of the next page.
+  memcpy(dest, src, page_remain);
+  byte_src += page_remain;
+  byte_dest += page_remain;
+  CHECK_ALIGNED(reinterpret_cast<uintptr_t>(byte_dest), kPageSize);
+  CHECK_ALIGNED(reinterpret_cast<uintptr_t>(byte_dest), sizeof(uintptr_t));
+  CHECK_ALIGNED(reinterpret_cast<uintptr_t>(byte_src), sizeof(uintptr_t));
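+  // Scan each whole source page one word at a time and write only the non-zero words;
+  // a source page that is entirely zero leaves its (pre-zeroed) destination page untouched.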
+  while (byte_src + kPageSize < limit) {
+    bool all_zero = true;
+    uintptr_t* word_dest = reinterpret_cast<uintptr_t*>(byte_dest);
+    const uintptr_t* word_src = reinterpret_cast<const uintptr_t*>(byte_src);
+    for (size_t i = 0; i < kPageSize / sizeof(*word_src); ++i) {
+      // Assumes the destination of the copy is all zeros.
+      if (word_src[i] != 0) {
+        all_zero = false;
+        word_dest[i] = word_src[i];
+      }
+    }
+    if (all_zero) {
+      // Avoided copying into the page since it was all zeros.
+      saved_bytes += kPageSize;
+    }
+    byte_src += kPageSize;
+    byte_dest += kPageSize;
+  }
+  // Handle the partial page at the end.
+  memcpy(byte_dest, byte_src, limit - byte_src);
+  return saved_bytes;
+}
+
 mirror::Object* SemiSpace::MarkNonForwardedObject(mirror::Object* obj) {
   size_t object_size = obj->SizeOf();
   size_t bytes_allocated;
@@ -458,7 +515,8 @@
   }
   // Copy over the object and add it to the mark stack since we still need to update its
   // references.
-  memcpy(reinterpret_cast<void*>(forward_address), obj, object_size);
+  saved_bytes_ +=
+      CopyAvoidingDirtyingPages(reinterpret_cast<void*>(forward_address), obj, object_size);
   if (to_space_live_bitmap_ != nullptr) {
     to_space_live_bitmap_->Set(forward_address);
   }
diff --git a/runtime/gc/collector/semi_space.h b/runtime/gc/collector/semi_space.h
index f58402f..89e2002 100644
--- a/runtime/gc/collector/semi_space.h
+++ b/runtime/gc/collector/semi_space.h
@@ -295,6 +295,9 @@
   // whole_heap_collection_ once per interval.
   int whole_heap_collection_interval_counter_;
 
+  // How many bytes we avoided dirtying.
+  size_t saved_bytes_;
+
   // Used for the generational mode. The default interval of the whole
   // heap collection. If N, the whole heap collection occurs every N
   // collections.
diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h
index 5e1136b..9c91b0e 100644
--- a/runtime/gc/heap-inl.h
+++ b/runtime/gc/heap-inl.h
@@ -82,11 +82,7 @@
     DCHECK(!Runtime::Current()->HasStatsEnabled());
   }
   if (AllocatorHasAllocationStack(allocator)) {
-    // This is safe to do since the GC will never free objects which are neither in the allocation
-    // stack or the live bitmap.
-    while (!allocation_stack_->AtomicPushBack(obj)) {
-      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
-    }
+    PushOnAllocationStack(self, obj);
   }
   if (kInstrumented) {
     if (Dbg::IsAllocTrackingEnabled()) {
@@ -111,6 +107,36 @@
   return obj;
 }
 
+// The size of a thread-local allocation stack, in number of references.
+static constexpr size_t kThreadLocalAllocationStackSize = 128;
+
+inline void Heap::PushOnAllocationStack(Thread* self, mirror::Object* obj) {
+  if (kUseThreadLocalAllocationStack) {
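+    // Fast path: push onto this thread's current allocation stack segment.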
+    bool success = self->PushOnThreadLocalAllocationStack(obj);
+    if (UNLIKELY(!success)) {
+      // Slow path. Allocate a new thread-local allocation stack.
+      mirror::Object** start_address;
+      mirror::Object** end_address;
+      while (!allocation_stack_->AtomicBumpBack(kThreadLocalAllocationStackSize,
+                                                &start_address, &end_address)) {
+        CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+      }
+      self->SetThreadLocalAllocationStack(start_address, end_address);
+      // Retry on the new thread-local allocation stack.
+      success = self->PushOnThreadLocalAllocationStack(obj);
+      // Must succeed.
+      CHECK(success);
+    }
+  } else {
+    // This is safe to do since the GC will never free objects which are neither in the allocation
+    // stack nor the live bitmap.
+    while (!allocation_stack_->AtomicPushBack(obj)) {
+      CollectGarbageInternal(collector::kGcTypeSticky, kGcCauseForAlloc, false);
+    }
+  }
+}
+
 template <bool kInstrumented, typename PreFenceVisitor>
 inline mirror::Object* Heap::AllocLargeObject(Thread* self, mirror::Class* klass,
                                               size_t byte_count,
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index a324925..f1126ef 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -381,7 +381,12 @@
   for (mirror::Object** it = allocation_stack_->Begin(), **end = allocation_stack_->End();
       it < end; ++it) {
     mirror::Object* obj = *it;
-    callback(obj, arg);
+    if (obj != nullptr && obj->GetClass() != nullptr) {
+      // Avoid the race condition caused by the object not yet being written into the allocation
+      // stack or the class not yet being written to the object. Also, with
+      // kUseThreadLocalAllocationStack, there can be nulls on the allocation stack.
+      callback(obj, arg);
+    }
   }
   GetLiveBitmap()->Walk(callback, arg);
   self->EndAssertNoThreadSuspension(old_cause);
@@ -1529,13 +1534,14 @@
   mirror::Object** limit = stack->End();
   for (mirror::Object** it = stack->Begin(); it != limit; ++it) {
     const mirror::Object* obj = *it;
-    DCHECK(obj != nullptr);
-    if (bitmap1->HasAddress(obj)) {
-      bitmap1->Set(obj);
-    } else if (bitmap2->HasAddress(obj)) {
-      bitmap2->Set(obj);
-    } else {
-      large_objects->Set(obj);
+    if (!kUseThreadLocalAllocationStack || obj != nullptr) {
+      if (bitmap1->HasAddress(obj)) {
+        bitmap1->Set(obj);
+      } else if (bitmap2->HasAddress(obj)) {
+        bitmap2->Set(obj);
+      } else {
+        large_objects->Set(obj);
+      }
     }
   }
 }
@@ -2000,7 +2006,9 @@
 
   // We can verify objects in the live stack since none of these should reference dead objects.
   for (mirror::Object** it = live_stack_->Begin(); it != live_stack_->End(); ++it) {
-    visitor(*it);
+    if (!kUseThreadLocalAllocationStack || *it != nullptr) {
+      visitor(*it);
+    }
   }
 
   if (visitor.Failed()) {
@@ -2010,10 +2018,30 @@
   return true;
 }
 
-void Heap::SwapStacks() {
+void Heap::SwapStacks(Thread* self) {
+  if (kUseThreadLocalAllocationStack) {
+    live_stack_->AssertAllZero();
+  }
   allocation_stack_.swap(live_stack_);
 }
 
+void Heap::RevokeAllThreadLocalAllocationStacks(Thread* self) {
+  if (!Runtime::Current()->IsStarted()) {
+    // There's no thread list if the runtime hasn't started (e.g.,
+    // in dex2oat or a test). Just revoke for self.
+    self->RevokeThreadLocalAllocationStack();
+    return;
+  }
+  // This must be called only during the pause.
+  CHECK(Locks::mutator_lock_->IsExclusiveHeld(self));
+  MutexLock mu(self, *Locks::runtime_shutdown_lock_);
+  MutexLock mu2(self, *Locks::thread_list_lock_);
+  std::list<Thread*> thread_list = Runtime::Current()->GetThreadList()->GetList();
+  for (Thread* t : thread_list) {
+    t->RevokeThreadLocalAllocationStack();
+  }
+}
+
 accounting::ModUnionTable* Heap::FindModUnionTableFromSpace(space::Space* space) {
   auto it = mod_union_tables_.find(space);
   if (it == mod_union_tables_.end()) {
@@ -2068,12 +2096,12 @@
     thread_list->SuspendAll();
     {
       ReaderMutexLock mu(self, *Locks::heap_bitmap_lock_);
-      SwapStacks();
+      SwapStacks(self);
       // Sort the live stack so that we can quickly binary search it later.
       if (!VerifyMissingCardMarks()) {
         LOG(FATAL) << "Pre " << gc->GetName() << " missing card mark verification failed";
       }
-      SwapStacks();
+      SwapStacks(self);
     }
     thread_list->ResumeAll();
   }
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index e416c0e..80a5a1a 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -111,6 +111,9 @@
 // If true, use rosalloc/RosAllocSpace instead of dlmalloc/DlMallocSpace
 static constexpr bool kUseRosAlloc = true;
 
+// If true, use thread-local allocation stack.
+static constexpr bool kUseThreadLocalAllocationStack = true;
+
 // The process state passed in from the activity manager, used to determine when to do trimming
 // and compaction.
 enum ProcessState {
@@ -665,11 +668,17 @@
       SHARED_LOCKS_REQUIRED(GlobalSychronization::heap_bitmap_lock_);
 
   // Swap the allocation stack with the live stack.
-  void SwapStacks();
+  void SwapStacks(Thread* self);
+
+  // Revoke all the thread-local allocation stacks.
+  void RevokeAllThreadLocalAllocationStacks(Thread* self);
 
   // Clear cards and update the mod union table.
   void ProcessCards(TimingLogger& timings);
 
+  // Push an object onto the allocation stack.
+  void PushOnAllocationStack(Thread* self, mirror::Object* obj);
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_;
 
diff --git a/runtime/gc/space/bump_pointer_space.cc b/runtime/gc/space/bump_pointer_space.cc
index f7bdc4c..f3f594f 100644
--- a/runtime/gc/space/bump_pointer_space.cc
+++ b/runtime/gc/space/bump_pointer_space.cc
@@ -137,6 +137,7 @@
 
 void BumpPointerSpace::Walk(ObjectCallback* callback, void* arg) {
   byte* pos = Begin();
+  byte* end = End();
   byte* main_end = pos;
   {
     MutexLock mu(Thread::Current(), block_lock_);
@@ -145,16 +146,29 @@
     if (num_blocks_ == 0) {
       UpdateMainBlock();
     }
-    main_end += main_block_size_;
+    main_end = Begin() + main_block_size_;
+    if (num_blocks_ == 0) {
+      // We don't have any other blocks, which means someone else may be allocating into the main
+      // block. In this case, we don't want to try to visit other blocks after the main block,
+      // since these could actually be part of the main block.
+      end = main_end;
+    }
   }
   // Walk all of the objects in the main block first.
   while (pos < main_end) {
     mirror::Object* obj = reinterpret_cast<mirror::Object*>(pos);
-    callback(obj, arg);
-    pos = reinterpret_cast<byte*>(GetNextObject(obj));
+    if (obj->GetClass() == nullptr) {
+      // There is a race condition where a thread has just allocated an object but not set the
+      // class. We can't know the size of this object, so we don't visit it, and we exit the
+      // function since there are guaranteed to be no other blocks.
+      return;
+    } else {
+      callback(obj, arg);
+      pos = reinterpret_cast<byte*>(GetNextObject(obj));
+    }
   }
   // Walk the other blocks (currently only TLABs).
-  while (pos < End()) {
+  while (pos < end) {
     BlockHeader* header = reinterpret_cast<BlockHeader*>(pos);
     size_t block_size = header->size_;
     pos += sizeof(BlockHeader);  // Skip the header so that we know where the objects
diff --git a/runtime/jni_internal.cc b/runtime/jni_internal.cc
index 6f31ca7..94ae89e 100644
--- a/runtime/jni_internal.cc
+++ b/runtime/jni_internal.cc
@@ -2029,7 +2029,7 @@
   }
 
   static const jchar* GetStringChars(JNIEnv* env, jstring java_string, jboolean* is_copy) {
-    CHECK_NON_NULL_ARGUMENT(GetStringUTFRegion, java_string);
+    CHECK_NON_NULL_ARGUMENT(GetStringChars, java_string);
     ScopedObjectAccess soa(env);
     String* s = soa.Decode<String*>(java_string);
     CharArray* chars = s->GetCharArray();
@@ -2048,7 +2048,7 @@
   }
 
   static void ReleaseStringChars(JNIEnv* env, jstring java_string, const jchar* chars) {
-    CHECK_NON_NULL_ARGUMENT(GetStringUTFRegion, java_string);
+    CHECK_NON_NULL_ARGUMENT(ReleaseStringChars, java_string);
     delete[] chars;
     ScopedObjectAccess soa(env);
     UnpinPrimitiveArray(soa, soa.Decode<String*>(java_string)->GetCharArray());
diff --git a/runtime/native/dalvik_system_Zygote.cc b/runtime/native/dalvik_system_Zygote.cc
index 7e6432e..29c0bc0 100644
--- a/runtime/native/dalvik_system_Zygote.cc
+++ b/runtime/native/dalvik_system_Zygote.cc
@@ -566,7 +566,7 @@
   return pid;
 }
 
-static jint Zygote_nativeForkAndSpecialize_new(JNIEnv* env, jclass, jint uid, jint gid, jintArray gids,
+static jint Zygote_nativeForkAndSpecialize(JNIEnv* env, jclass, jint uid, jint gid, jintArray gids,
                                            jint debug_flags, jobjectArray rlimits,
                                            jint mount_external, jstring se_info, jstring se_name,
                                            jintArray fdsToClose) {
@@ -574,13 +574,6 @@
                                  se_info, se_name, false, fdsToClose);
 }
 
-static jint Zygote_nativeForkAndSpecialize(JNIEnv* env, jclass, jint uid, jint gid, jintArray gids,
-                                           jint debug_flags, jobjectArray rlimits,
-                                           jint mount_external, jstring se_info, jstring se_name) {
-  return ForkAndSpecializeCommon(env, uid, gid, gids, debug_flags, rlimits, 0, 0, mount_external,
-                                 se_info, se_name, false, NULL);
-}
-
 static jint Zygote_nativeForkSystemServer(JNIEnv* env, jclass, uid_t uid, gid_t gid, jintArray gids,
                                           jint debug_flags, jobjectArray rlimits,
                                           jlong permittedCapabilities,
@@ -605,8 +598,7 @@
 }
 
 static JNINativeMethod gMethods[] = {
-  NATIVE_METHOD(Zygote, nativeForkAndSpecialize_new, "(II[II[[IILjava/lang/String;Ljava/lang/String;[I)I"),
-  NATIVE_METHOD(Zygote, nativeForkAndSpecialize, "(II[II[[IILjava/lang/String;Ljava/lang/String;)I"),
+  NATIVE_METHOD(Zygote, nativeForkAndSpecialize, "(II[II[[IILjava/lang/String;Ljava/lang/String;[I)I"),
   NATIVE_METHOD(Zygote, nativeForkSystemServer, "(II[II[[IJJ)I"),
 };
 
diff --git a/runtime/thread-inl.h b/runtime/thread-inl.h
index 9420e7b..c0bf377 100644
--- a/runtime/thread-inl.h
+++ b/runtime/thread-inl.h
@@ -170,6 +170,43 @@
   return ret;
 }
 
+inline bool Thread::PushOnThreadLocalAllocationStack(mirror::Object* obj) {
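+  // Fast path: bump-pointer store into this thread's segment of the shared allocation stack.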
+  DCHECK_LE(thread_local_alloc_stack_top_, thread_local_alloc_stack_end_);
+  if (thread_local_alloc_stack_top_ < thread_local_alloc_stack_end_) {
+    // There's room.
+    DCHECK_LE(reinterpret_cast<byte*>(thread_local_alloc_stack_top_) + sizeof(mirror::Object*),
+              reinterpret_cast<byte*>(thread_local_alloc_stack_end_));
+    DCHECK(*thread_local_alloc_stack_top_ == nullptr);
+    *thread_local_alloc_stack_top_ = obj;
+    ++thread_local_alloc_stack_top_;
+    return true;
+  }
+  return false;
+}
+
+inline void Thread::SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end) {
+  DCHECK(Thread::Current() == this) << "Should be called by self";
+  DCHECK(start != nullptr);
+  DCHECK(end != nullptr);
+  DCHECK_ALIGNED(start, sizeof(mirror::Object*));
+  DCHECK_ALIGNED(end, sizeof(mirror::Object*));
+  DCHECK_LT(start, end);
+  thread_local_alloc_stack_end_ = end;
+  thread_local_alloc_stack_top_ = start;
+}
+
+inline void Thread::RevokeThreadLocalAllocationStack() {
+  if (kIsDebugBuild) {
+    // Note: self is not necessarily equal to this thread, since this thread may be suspended.
+    Thread* self = Thread::Current();
+    DCHECK(this == self || IsSuspended() || GetState() == kWaitingPerformingGc)
+        << GetState() << " thread " << this << " self " << self;
+  }
+  thread_local_alloc_stack_end_ = nullptr;
+  thread_local_alloc_stack_top_ = nullptr;
+}
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_THREAD_INL_H_
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 9797a48..3382811 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -963,7 +963,9 @@
       thread_local_start_(nullptr),
       thread_local_pos_(nullptr),
       thread_local_end_(nullptr),
-      thread_local_objects_(0) {
+      thread_local_objects_(0),
+      thread_local_alloc_stack_top_(nullptr),
+      thread_local_alloc_stack_end_(nullptr) {
   CHECK_EQ((sizeof(Thread) % 4), 0U) << sizeof(Thread);
   state_and_flags_.as_struct.flags = 0;
   state_and_flags_.as_struct.state = kNative;
diff --git a/runtime/thread.h b/runtime/thread.h
index a3a77bb..6c072ba 100644
--- a/runtime/thread.h
+++ b/runtime/thread.h
@@ -829,6 +829,19 @@
   static const size_t kRosAllocNumOfSizeBrackets = 34;
   void* rosalloc_runs_[kRosAllocNumOfSizeBrackets];
 
+  // Thread-local allocation stack data/routines.
+  mirror::Object** thread_local_alloc_stack_top_;
+  mirror::Object** thread_local_alloc_stack_end_;
+
+  // Push an object onto the thread-local allocation stack. Returns false if the stack is full.
+  bool PushOnThreadLocalAllocationStack(mirror::Object* obj);
+
+  // Set the thread-local allocation pointers to the given pointers.
+  void SetThreadLocalAllocationStack(mirror::Object** start, mirror::Object** end);
+
+  // Resets the thread-local allocation pointers.
+  void RevokeThreadLocalAllocationStack();
+
  private:
   friend class Dbg;  // For SetStateUnsafe.
   friend class Monitor;