Merge "Revert "Enable store elimination for singleton objects.""
diff --git a/compiler/common_compiler_test.cc b/compiler/common_compiler_test.cc
index 151437b..c37ceca 100644
--- a/compiler/common_compiler_test.cc
+++ b/compiler/common_compiler_test.cc
@@ -77,11 +77,10 @@
 
     header_code_and_maps_chunks_.push_back(std::vector<uint8_t>());
     std::vector<uint8_t>* chunk = &header_code_and_maps_chunks_.back();
-    size_t size = sizeof(method_header) + code_size + vmap_table.size() + mapping_table_size +
-        gc_map_size;
-    size_t code_offset = compiled_method->AlignCode(size - code_size);
-    size_t padding = code_offset - (size - code_size);
-    chunk->reserve(padding + size);
+    const size_t max_padding = GetInstructionSetAlignment(compiled_method->GetInstructionSet());
+    const size_t size =
+        gc_map_size + mapping_table_size + vmap_table.size() + sizeof(method_header) + code_size;
+    chunk->reserve(size + max_padding);
     chunk->resize(sizeof(method_header));
     memcpy(&(*chunk)[0], &method_header, sizeof(method_header));
     chunk->insert(chunk->begin(), vmap_table.begin(), vmap_table.end());
@@ -91,10 +90,16 @@
     if (gc_map_used) {
       chunk->insert(chunk->begin(), gc_map.begin(), gc_map.end());
     }
-    chunk->insert(chunk->begin(), padding, 0);
     chunk->insert(chunk->end(), code.begin(), code.end());
-    CHECK_EQ(padding + size, chunk->size());
-    const void* code_ptr = &(*chunk)[code_offset];
+    CHECK_EQ(chunk->size(), size);
+    const void* unaligned_code_ptr = chunk->data() + (size - code_size);
+    size_t offset = dchecked_integral_cast<size_t>(reinterpret_cast<uintptr_t>(unaligned_code_ptr));
+    size_t padding = compiled_method->AlignCode(offset) - offset;
+    // Make sure no resizing takes place.
+    CHECK_GE(chunk->capacity(), chunk->size() + padding);
+    chunk->insert(chunk->begin(), padding, 0);
+    const void* code_ptr = reinterpret_cast<const uint8_t*>(unaligned_code_ptr) + padding;
+    CHECK_EQ(code_ptr, static_cast<const void*>(chunk->data() + (chunk->size() - code_size)));
     MakeExecutable(code_ptr, code.size());
     const void* method_code = CompiledMethod::CodePointer(code_ptr,
                                                           compiled_method->GetInstructionSet());
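
A note on the technique above: the rewritten chunk setup reserves worst-case capacity first and only then computes padding from the vector's actual address, so the final insert can never reallocate and invalidate the pointer the padding was derived from. A minimal self-contained sketch of the same idea, with generic names rather than the ART helpers:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Place `code` at an `alignment`-aligned address inside a byte vector.
    std::vector<uint8_t> BuildAlignedChunk(const std::vector<uint8_t>& code,
                                           size_t alignment) {
      std::vector<uint8_t> chunk;
      // Reserve the worst case up front so the padding insert below can
      // never reallocate and move the buffer we just measured.
      chunk.reserve(code.size() + alignment);
      chunk.insert(chunk.end(), code.begin(), code.end());
      const uintptr_t addr = reinterpret_cast<uintptr_t>(chunk.data());
      const size_t padding = (alignment - (addr % alignment)) % alignment;
      assert(chunk.capacity() >= chunk.size() + padding);  // no resizing
      chunk.insert(chunk.begin(), padding, 0);  // code lands at addr + padding
      return chunk;
    }
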
diff --git a/compiler/optimizing/code_generator_mips.cc b/compiler/optimizing/code_generator_mips.cc
index 6aed444..e6b9273 100644
--- a/compiler/optimizing/code_generator_mips.cc
+++ b/compiler/optimizing/code_generator_mips.cc
@@ -3118,15 +3118,25 @@
 }
 
 void LocationsBuilderMIPS::VisitLoadClass(HLoadClass* cls) {
-  LocationSummary::CallKind call_kind = cls->CanCallRuntime() ? LocationSummary::kCallOnSlowPath
-                                                              : LocationSummary::kNoCall;
-  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(cls, call_kind);
-  locations->SetInAt(0, Location::RequiresRegister());
-  locations->SetOut(Location::RequiresRegister());
+  InvokeRuntimeCallingConvention calling_convention;
+  CodeGenerator::CreateLoadClassLocationSummary(
+      cls,
+      Location::RegisterLocation(calling_convention.GetRegisterAt(0)),
+      Location::RegisterLocation(V0));
 }
 
 void InstructionCodeGeneratorMIPS::VisitLoadClass(HLoadClass* cls) {
   LocationSummary* locations = cls->GetLocations();
+  if (cls->NeedsAccessCheck()) {
+    codegen_->MoveConstant(locations->GetTemp(0), cls->GetTypeIndex());
+    codegen_->InvokeRuntime(QUICK_ENTRY_POINT(pInitializeTypeAndVerifyAccess),
+                            cls,
+                            cls->GetDexPc(),
+                            nullptr,
+                            IsDirectEntrypoint(kQuickInitializeTypeAndVerifyAccess));
+    return;
+  }
+
   Register out = locations->Out().AsRegister<Register>();
   Register current_method = locations->InAt(0).AsRegister<Register>();
   if (cls->IsReferrersClass()) {
diff --git a/compiler/optimizing/induction_var_analysis.cc b/compiler/optimizing/induction_var_analysis.cc
index 8968a44..b6220fc 100644
--- a/compiler/optimizing/induction_var_analysis.cc
+++ b/compiler/optimizing/induction_var_analysis.cc
@@ -20,14 +20,14 @@
 namespace art {
 
 /**
- * Returns true if instruction is invariant within the given loop.
+ * Returns true if instruction is invariant within the given loop
+ * (i.e. the instruction is not defined in the same loop or a more deeply nested one).
  */
 static bool IsLoopInvariant(HLoopInformation* loop, HInstruction* instruction) {
   HLoopInformation* other_loop = instruction->GetBlock()->GetLoopInformation();
-  if (other_loop != loop) {
-    // If instruction does not occur in same loop, it is invariant
-    // if it appears in an outer loop (including no loop at all).
-    return other_loop == nullptr || loop->IsIn(*other_loop);
+  if (other_loop != loop && (other_loop == nullptr || !other_loop->IsIn(*loop))) {
+    DCHECK(instruction->GetBlock()->Dominates(loop->GetHeader()));
+    return true;
   }
   return false;
 }
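
For reference, the tightened predicate accepts an instruction exactly when its defining block lies outside the given loop (in an outer loop, or in no loop at all), which is what the new DCHECK records. An illustration only, where `Use` is a hypothetical consumer:

    void Example(int n) {
      for (int i = 0; i < n; i++) {    // `n` is invariant for both loops
        for (int j = 0; j < i; j++) {  // `i` is invariant for the inner loop only
          Use(i, j, n);                // `j` is invariant for neither
        }
      }
    }
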
@@ -601,15 +601,16 @@
   //     an unsigned entity, for example, as in the following loop that uses the full range:
   //     for (int i = INT_MIN; i < INT_MAX; i++) // TC = UINT_MAX
   // (2) The TC is only valid if the loop is taken, otherwise TC = 0, as in:
-  //     for (int i = 12; i < U; i++) // TC = 0 when U >= 12
+  //     for (int i = 12; i < U; i++) // TC = 0 when U < 12
   //     If this cannot be determined at compile-time, the TC is only valid within the
-  //     loop-body proper, not the loop-header unless enforced with an explicit condition.
+  //     loop-body proper, not the loop-header unless enforced with an explicit taken-test.
   // (3) The TC is only valid if the loop is finite, otherwise TC has no value, as in:
   //     for (int i = 0; i <= U; i++) // TC = Inf when U = INT_MAX
   //     If this cannot be determined at compile-time, the TC is only valid when enforced
-  //     with an explicit condition.
+  //     with an explicit finite-test.
 // (4) For loops that exit early, the TC forms an upper bound, as in:
   //     for (int i = 0; i < 10 && ....; i++) // TC <= 10
+  InductionInfo* trip_count = upper_expr;
   const bool is_taken = IsTaken(lower_expr, upper_expr, cmp);
   const bool is_finite = IsFinite(upper_expr, stride_value, type, cmp);
   const bool cancels = (cmp == kCondLT || cmp == kCondGT) && std::abs(stride_value) == 1;
@@ -617,26 +618,36 @@
     // Convert exclusive integral inequality into inclusive integral inequality,
     // viz. condition i < U is i <= U - 1 and condition i > U is i >= U + 1.
     if (cmp == kCondLT) {
-      upper_expr = CreateInvariantOp(kSub, upper_expr, CreateConstant(1, type));
+      trip_count = CreateInvariantOp(kSub, trip_count, CreateConstant(1, type));
     } else if (cmp == kCondGT) {
-      upper_expr = CreateInvariantOp(kAdd, upper_expr, CreateConstant(1, type));
+      trip_count = CreateInvariantOp(kAdd, trip_count, CreateConstant(1, type));
     }
     // Compensate for stride.
-    upper_expr = CreateInvariantOp(kAdd, upper_expr, stride);
+    trip_count = CreateInvariantOp(kAdd, trip_count, stride);
   }
-  InductionInfo* trip_count
-      = CreateInvariantOp(kDiv, CreateInvariantOp(kSub, upper_expr, lower_expr), stride);
+  trip_count = CreateInvariantOp(kDiv, CreateInvariantOp(kSub, trip_count, lower_expr), stride);
   // Assign the trip-count expression to the loop control. Clients that use the information
   // should be aware that the expression is only valid under the conditions listed above.
-  InductionOp tcKind = kTripCountInBodyUnsafe;
+  InductionOp tcKind = kTripCountInBodyUnsafe;  // needs both tests
   if (is_taken && is_finite) {
-    tcKind = kTripCountInLoop;
+    tcKind = kTripCountInLoop;  // needs neither test
   } else if (is_finite) {
-    tcKind = kTripCountInBody;
+    tcKind = kTripCountInBody;  // needs taken-test
   } else if (is_taken) {
-    tcKind = kTripCountInLoopUnsafe;
+    tcKind = kTripCountInLoopUnsafe;  // needs finite-test
   }
-  AssignInfo(loop, loop->GetHeader()->GetLastInstruction(), CreateTripCount(tcKind, trip_count));
+  InductionOp op = kNop;
+  switch (cmp) {
+    case kCondLT: op = kLT; break;
+    case kCondLE: op = kLE; break;
+    case kCondGT: op = kGT; break;
+    case kCondGE: op = kGE; break;
+    default:      LOG(FATAL) << "CONDITION UNREACHABLE";
+  }
+  InductionInfo* taken_test = CreateInvariantOp(op, lower_expr, upper_expr);
+  AssignInfo(loop,
+             loop->GetHeader()->GetLastInstruction(),
+             CreateTripCount(tcKind, trip_count, taken_test));
 }
 
 bool HInductionVarAnalysis::IsTaken(InductionInfo* lower_expr,
@@ -829,12 +840,16 @@
       std::string inv = "(";
       inv += InductionToString(info->op_a);
       switch (info->operation) {
-        case kNop:   inv += " @ "; break;
-        case kAdd:   inv += " + "; break;
+        case kNop:   inv += " @ ";  break;
+        case kAdd:   inv += " + ";  break;
         case kSub:
-        case kNeg:   inv += " - "; break;
-        case kMul:   inv += " * "; break;
-        case kDiv:   inv += " / "; break;
+        case kNeg:   inv += " - ";  break;
+        case kMul:   inv += " * ";  break;
+        case kDiv:   inv += " / ";  break;
+        case kLT:    inv += " < ";  break;
+        case kLE:    inv += " <= "; break;
+        case kGT:    inv += " > ";  break;
+        case kGE:    inv += " >= "; break;
         case kFetch:
           DCHECK(info->fetch);
           if (IsIntAndGet(info, &value)) {
@@ -843,10 +858,10 @@
             inv += std::to_string(info->fetch->GetId()) + ":" + info->fetch->DebugName();
           }
           break;
-        case kTripCountInLoop:       inv += "TC-loop:"; break;
-        case kTripCountInBody:       inv += "TC-body:"; break;
-        case kTripCountInLoopUnsafe: inv += "TC-loop-unsafe:"; break;
-        case kTripCountInBodyUnsafe: inv += "TC-body-unsafe:"; break;
+        case kTripCountInLoop:       inv += " (TC-loop) ";        break;
+        case kTripCountInBody:       inv += " (TC-body) ";        break;
+        case kTripCountInLoopUnsafe: inv += " (TC-loop-unsafe) "; break;
+        case kTripCountInBodyUnsafe: inv += " (TC-body-unsafe) "; break;
       }
       inv += InductionToString(info->op_b);
       return inv + ")";
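
To make the reworked computation concrete, here is a worked trip-count example under the new scheme, where op_a carries the count and op_b the taken-test (values follow the comments above; no additional semantics are assumed):

    // for (int i = 12; i < U; i++)
    //   lower_expr = 12, upper_expr = U, stride = 1, cmp = kCondLT
    //   exclusive -> inclusive:  trip_count = U - 1
    //   compensate for stride:   trip_count = (U - 1) + 1
    //   divide by the stride:    trip_count = (U - 12) / 1
    //   taken-test (op_b):       12 < U   // TC = 0 when U < 12, per case (2)
    // is_taken && is_finite -> kTripCountInLoop        (needs neither test)
    // is_finite only        -> kTripCountInBody        (needs taken-test)
    // is_taken only         -> kTripCountInLoopUnsafe  (needs finite-test)
    // neither               -> kTripCountInBodyUnsafe  (needs both tests)
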
diff --git a/compiler/optimizing/induction_var_analysis.h b/compiler/optimizing/induction_var_analysis.h
index 7ab80cd..cf35409 100644
--- a/compiler/optimizing/induction_var_analysis.h
+++ b/compiler/optimizing/induction_var_analysis.h
@@ -65,11 +65,16 @@
     kMul,
     kDiv,
     kFetch,
-    // Trip counts (valid in full loop or only body proper; unsafe implies loop may be infinite).
-    kTripCountInLoop,
-    kTripCountInBody,
-    kTripCountInLoopUnsafe,
-    kTripCountInBodyUnsafe
+    // Trip-counts.
+    kTripCountInLoop,        // valid in full loop; loop is finite
+    kTripCountInBody,        // valid in body only; loop is finite
+    kTripCountInLoopUnsafe,  // valid in full loop; loop may be infinite
+    kTripCountInBodyUnsafe,  // valid in body only; loop may be infinite
+    // Comparisons for trip-count tests.
+    kLT,
+    kLE,
+    kGT,
+    kGE
   };
 
   /**
@@ -85,7 +90,7 @@
    *   (4) periodic
    *         nop: a, then defined by b (repeated when exhausted)
    *   (5) trip-count:
-   *         tc: defined by b
+   *         tc: defined by a, taken-test in b
    */
   struct InductionInfo : public ArenaObject<kArenaAllocInductionVarAnalysis> {
     InductionInfo(InductionClass ic,
@@ -119,8 +124,9 @@
     return new (graph_->GetArena()) InductionInfo(kInvariant, kFetch, nullptr, nullptr, f);
   }
 
-  InductionInfo* CreateTripCount(InductionOp op, InductionInfo* b) {
-    return new (graph_->GetArena()) InductionInfo(kInvariant, op, nullptr, b, nullptr);
+  InductionInfo* CreateTripCount(InductionOp op, InductionInfo* a, InductionInfo* b) {
+    DCHECK(a != nullptr);
+    return new (graph_->GetArena()) InductionInfo(kInvariant, op, a, b, nullptr);
   }
 
   InductionInfo* CreateInduction(InductionClass ic, InductionInfo* a, InductionInfo* b) {
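
A hypothetical call site for the new three-argument form, mirroring what the analysis now builds (the taken-test may still be nullptr, as the range test below passes):

    // Count goes in op_a, taken-test in op_b; `type` is assumed in scope.
    InductionInfo* count = CreateConstant(100, type);
    InductionInfo* taken = CreateInvariantOp(kLT, CreateConstant(0, type), count);
    InductionInfo* tc = CreateTripCount(kTripCountInLoop, count, taken);
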
diff --git a/compiler/optimizing/induction_var_analysis_test.cc b/compiler/optimizing/induction_var_analysis_test.cc
index f16da2a..b7262f6 100644
--- a/compiler/optimizing/induction_var_analysis_test.cc
+++ b/compiler/optimizing/induction_var_analysis_test.cc
@@ -234,7 +234,7 @@
   EXPECT_STREQ("((1) * i + (1))", GetInductionInfo(increment_[0], 0).c_str());
 
   // Trip-count.
-  EXPECT_STREQ("(TC-loop:(100))",
+  EXPECT_STREQ("((100) (TC-loop) ((0) < (100)))",
                GetInductionInfo(loop_header_[0]->GetLastInstruction(), 0).c_str());
 }
 
@@ -552,7 +552,7 @@
     }
     EXPECT_STREQ("((1) * i + (1))", GetInductionInfo(increment_[d], d).c_str());
     // Trip-count.
-    EXPECT_STREQ("(TC-loop:(100))",
+    EXPECT_STREQ("((100) (TC-loop) ((0) < (100)))",
                  GetInductionInfo(loop_header_[d]->GetLastInstruction(), d).c_str());
   }
 }
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index f4842f9..5530d26 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -152,7 +152,7 @@
     }
   } else if (is_min) {
     // Special case for finding minimum: minimum of trip-count in loop-body is 1.
-    if (trip != nullptr && in_body && instruction == trip->op_b->fetch) {
+    if (trip != nullptr && in_body && instruction == trip->op_a->fetch) {
       return Value(1);
     }
   }
@@ -185,14 +185,14 @@
             return GetFetch(info->fetch, trip, in_body, is_min);
           case HInductionVarAnalysis::kTripCountInLoop:
             if (!in_body && !is_min) {  // one extra!
-              return GetVal(info->op_b, trip, in_body, is_min);
+              return GetVal(info->op_a, trip, in_body, is_min);
             }
             FALLTHROUGH_INTENDED;
           case HInductionVarAnalysis::kTripCountInBody:
             if (is_min) {
               return Value(0);
             } else if (in_body) {
-              return SubValue(GetVal(info->op_b, trip, in_body, is_min), Value(1));
+              return SubValue(GetVal(info->op_a, trip, in_body, is_min), Value(1));
             }
             break;
           default:
@@ -428,7 +428,7 @@
             return true;
           case HInductionVarAnalysis::kTripCountInLoop:
             if (!in_body && !is_min) {  // one extra!
-              return GenerateCode(info->op_b, trip, graph, block, result, in_body, is_min);
+              return GenerateCode(info->op_a, trip, graph, block, result, in_body, is_min);
             }
             FALLTHROUGH_INTENDED;
           case HInductionVarAnalysis::kTripCountInBody:
@@ -438,7 +438,7 @@
               }
               return true;
             } else if (in_body) {
-              if (GenerateCode(info->op_b, trip, graph, block, &opb, in_body, is_min)) {
+              if (GenerateCode(info->op_a, trip, graph, block, &opb, in_body, is_min)) {
                 if (graph != nullptr) {
                   *result = Insert(block,
                                    new (graph->GetArena())
diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc
index 8fbc59f..ce8926a 100644
--- a/compiler/optimizing/induction_var_range_test.cc
+++ b/compiler/optimizing/induction_var_range_test.cc
@@ -125,7 +125,7 @@
 
   /** Constructs a trip-count. */
   HInductionVarAnalysis::InductionInfo* CreateTripCount(int32_t tc) {
-    return iva_->CreateTripCount(HInductionVarAnalysis::kTripCountInLoop, CreateConst(tc));
+    return iva_->CreateTripCount(HInductionVarAnalysis::kTripCountInLoop, CreateConst(tc), nullptr);
   }
 
   /** Constructs a linear a * i + b induction. */
diff --git a/compiler/optimizing/reference_type_propagation.cc b/compiler/optimizing/reference_type_propagation.cc
index 26a05da..659da06 100644
--- a/compiler/optimizing/reference_type_propagation.cc
+++ b/compiler/optimizing/reference_type_propagation.cc
@@ -373,12 +373,18 @@
   if (instr->IsInvokeStaticOrDirect() && instr->AsInvokeStaticOrDirect()->IsStringInit()) {
     // Calls to String.<init> are replaced with a StringFactory.
     if (kIsDebugBuild) {
-      ScopedObjectAccess soa(Thread::Current());
+      HInvoke* invoke = instr->AsInvoke();
       ClassLinker* cl = Runtime::Current()->GetClassLinker();
-      mirror::DexCache* dex_cache = cl->FindDexCache(
-          soa.Self(), instr->AsInvoke()->GetDexFile(), false);
-      ArtMethod* method = dex_cache->GetResolvedMethod(
-          instr->AsInvoke()->GetDexMethodIndex(), cl->GetImagePointerSize());
+      ScopedObjectAccess soa(Thread::Current());
+      StackHandleScope<2> hs(soa.Self());
+      Handle<mirror::DexCache> dex_cache(
+          hs.NewHandle(cl->FindDexCache(soa.Self(), invoke->GetDexFile(), false)));
+      // Use a null loader. We should probably use the compiling method's class loader,
+      // but then we would need to pass it to RTPVisitor just for this debug check. Since
+      // the method is from the String class, the null loader is good enough.
+      Handle<mirror::ClassLoader> loader;
+      ArtMethod* method = cl->ResolveMethod(
+          invoke->GetDexFile(), invoke->GetDexMethodIndex(), dex_cache, loader, nullptr, kDirect);
       DCHECK(method != nullptr);
       mirror::Class* declaring_class = method->GetDeclaringClass();
       DCHECK(declaring_class != nullptr);
diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S
index be5a15e..9ccabad 100644
--- a/runtime/arch/arm64/quick_entrypoints_arm64.S
+++ b/runtime/arch/arm64/quick_entrypoints_arm64.S
@@ -1437,7 +1437,107 @@
 ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
 
 // Generate the allocation entrypoints for each allocator.
-GENERATE_ALL_ALLOC_ENTRYPOINTS
+GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
+GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
+ENTRY art_quick_alloc_object_rosalloc
+    // Fast path rosalloc allocation.
+    // x0: type_idx/return value, x1: ArtMethod*, xSELF(x19): Thread::Current
+    // x2-x7: free.
+    ldr    x2, [x1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_64]    // Load dex cache resolved types array
+                                                              // Load the class (x2)
+    ldr    w2, [x2, x0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+    cbz    x2, .Lart_quick_alloc_object_rosalloc_slow_path    // Check null class
+                                                              // Check class status.
+    ldr    w3, [x2, #MIRROR_CLASS_STATUS_OFFSET]
+    cmp    x3, #MIRROR_CLASS_STATUS_INITIALIZED
+    bne    .Lart_quick_alloc_object_rosalloc_slow_path
+                                                              // Add a fake dependence from the
+                                                              // following access flag and size
+                                                              // loads to the status load.
+                                                              // This is to prevent those loads
+                                                              // from being reordered above the
+                                                              // status load and reading wrong
+                                                              // values (an alternative is to use
+                                                              // a load-acquire for the status).
+    eor    x3, x3, x3
+    add    x2, x2, x3
+                                                              // Check access flags has
+                                                              // kAccClassIsFinalizable
+    ldr    w3, [x2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
+    tst    x3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
+    bne    .Lart_quick_alloc_object_rosalloc_slow_path
+    ldr    x3, [xSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]  // Check if the thread local
+                                                              // allocation stack has room.
+                                                              // ldp won't work due to large offset.
+    ldr    x4, [xSELF, #THREAD_LOCAL_ALLOC_STACK_END_OFFSET]
+    cmp    x3, x4
+    bhs    .Lart_quick_alloc_object_rosalloc_slow_path
+    ldr    w3, [x2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (x3)
+    cmp    x3, #ROSALLOC_MAX_THREAD_LOCAL_BRACKET_SIZE        // Check if the size is for a thread
+                                                              // local allocation
+    bhs    .Lart_quick_alloc_object_rosalloc_slow_path
+                                                              // Compute the rosalloc bracket index
+                                                              // from the size.
+                                                              // Align up the size by the rosalloc
+                                                              // bracket quantum size, divide by
+                                                              // the quantum size, and subtract 1.
+                                                              // This code is a shorter but
+                                                              // equivalent version.
+    sub    x3, x3, #1
+    lsr    x3, x3, #ROSALLOC_BRACKET_QUANTUM_SIZE_SHIFT
+                                                              // Load the rosalloc run (x4)
+    add    x4, xSELF, x3, lsl #POINTER_SIZE_SHIFT
+    ldr    x4, [x4, #THREAD_ROSALLOC_RUNS_OFFSET]
+                                                              // Load the free list head (x3). This
+                                                              // will be the return val.
+    ldr    x3, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
+    cbz    x3, .Lart_quick_alloc_object_rosalloc_slow_path
+    // "Point of no slow path". Won't go to the slow path from here on. OK to clobber x0 and x1.
+    ldr    x1, [x3, #ROSALLOC_SLOT_NEXT_OFFSET]               // Load the next pointer of the head
+                                                              // and update the list head with the
+                                                              // next pointer.
+    str    x1, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_HEAD_OFFSET)]
+                                                              // Store the class pointer in the
+                                                              // header. This also overwrites the
+                                                              // next pointer. The offsets are
+                                                              // asserted to match.
+#if ROSALLOC_SLOT_NEXT_OFFSET != MIRROR_OBJECT_CLASS_OFFSET
+#error "Class pointer needs to overwrite next pointer."
+#endif
+    POISON_HEAP_REF w2
+    str    w2, [x3, #MIRROR_OBJECT_CLASS_OFFSET]
+                                                              // Push the new object onto the thread
+                                                              // local allocation stack and
+                                                              // increment the thread local
+                                                              // allocation stack top.
+    ldr    x1, [xSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+    str    w3, [x1], #COMPRESSED_REFERENCE_SIZE               // (Increment x1 as a side effect.)
+    str    x1, [xSELF, #THREAD_LOCAL_ALLOC_STACK_TOP_OFFSET]
+                                                              // Decrement the size of the free list
+    ldr    w1, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
+    sub    x1, x1, #1
+                                                              // TODO: consider combining this store
+                                                              // and the list head store above using
+                                                              // strd.
+    str    w1, [x4, #(ROSALLOC_RUN_FREE_LIST_OFFSET + ROSALLOC_RUN_FREE_LIST_SIZE_OFFSET)]
+                                                              // Fence. This is "ish" not "ishst" so
+                                                              // that the code after this allocation
+                                                              // site will see the right values in
+                                                              // the fields of the class.
+                                                              // (Alternatively we could use "ishst"
+                                                              // if we use a load-acquire for the
+                                                              // class status load.)
+    dmb    ish
+    mov    x0, x3                                             // Set the return value and return.
+    ret
+.Lart_quick_alloc_object_rosalloc_slow_path:
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME      // save callee saves in case of GC
+    mov    x2, xSELF                       // pass Thread::Current
+    bl     artAllocObjectFromCodeRosAlloc  // (uint32_t type_idx, Method* method, Thread*)
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_rosalloc
 
     /*
      * Called by managed code when the thread has been asked to suspend.
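
At its core, the hand-written entrypoint above is a thread-local free-list pop in which the object's class reference is written over the reclaimed next pointer. A self-contained C++ sketch of just that step, using stand-in types rather than ART's:

    #include <cstdint>

    struct Slot { Slot* next; };  // `next` occupies the same bytes the
                                  // object's class reference will use
    struct FreeList { Slot* head; uint32_t size; };

    // Returns nullptr when the caller must take the slow path instead.
    void* AllocFromFreeList(FreeList* run, uint32_t klass_ref) {
      Slot* slot = run->head;
      if (slot == nullptr) {
        return nullptr;
      }
      run->head = slot->next;  // pop the list head
      run->size--;
      // Storing the class reference overwrites the next pointer, which is
      // exactly what the #error check above asserts for the real layout.
      *reinterpret_cast<uint32_t*>(slot) = klass_ref;
      return slot;
    }
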
diff --git a/runtime/art_method.cc b/runtime/art_method.cc
index a10d7af..fe0afa6 100644
--- a/runtime/art_method.cc
+++ b/runtime/art_method.cc
@@ -361,19 +361,6 @@
   return true;
 }
 
-ProfilingInfo* ArtMethod::CreateProfilingInfo() {
-  DCHECK(!Runtime::Current()->IsAotCompiler());
-  ProfilingInfo* info = ProfilingInfo::Create(this);
-  MemberOffset offset = ArtMethod::EntryPointFromJniOffset(sizeof(void*));
-  uintptr_t pointer = reinterpret_cast<uintptr_t>(this) + offset.Uint32Value();
-  if (!reinterpret_cast<Atomic<ProfilingInfo*>*>(pointer)->
-          CompareExchangeStrongSequentiallyConsistent(nullptr, info)) {
-    return GetProfilingInfo(sizeof(void*));
-  } else {
-    return info;
-  }
-}
-
 const uint8_t* ArtMethod::GetQuickenedInfo() {
   bool found = false;
   OatFile::OatMethod oat_method =
@@ -427,6 +414,12 @@
   bool found;
   OatFile::OatMethod oat_method = class_linker->FindOatMethodFor(this, &found);
   if (!found) {
+    if (class_linker->IsQuickResolutionStub(existing_entry_point)) {
+      // We are running the generic jni stub, but the entry point of the method has not
+      // been updated yet.
+      DCHECK(IsNative());
+      return nullptr;
+    }
     // Only for unit tests.
     // TODO(ngeoffray): Update these tests to pass the right pc?
     return OatQuickMethodHeader::FromEntryPoint(existing_entry_point);
diff --git a/runtime/art_method.h b/runtime/art_method.h
index bb9804e..551989d 100644
--- a/runtime/art_method.h
+++ b/runtime/art_method.h
@@ -305,12 +305,18 @@
         PtrSizedFields, entry_point_from_quick_compiled_code_) / sizeof(void*) * pointer_size);
   }
 
-  ProfilingInfo* CreateProfilingInfo() SHARED_REQUIRES(Locks::mutator_lock_);
-
   ProfilingInfo* GetProfilingInfo(size_t pointer_size) {
     return reinterpret_cast<ProfilingInfo*>(GetEntryPointFromJniPtrSize(pointer_size));
   }
 
+  ALWAYS_INLINE void SetProfilingInfo(ProfilingInfo* info) {
+    SetEntryPointFromJniPtrSize(info, sizeof(void*));
+  }
+
+  static MemberOffset ProfilingInfoOffset() {
+    return EntryPointFromJniOffset(sizeof(void*));
+  }
+
   void* GetEntryPointFromJni() {
     return GetEntryPointFromJniPtrSize(sizeof(void*));
   }
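
SetProfilingInfo and ProfilingInfoOffset make explicit that the pointer-sized JNI entrypoint field is repurposed to hold the ProfilingInfo* for managed methods under the JIT. Schematically (a stand-in declaration, not the real ArtMethod layout):

    class ProfilingInfo;

    // One pointer-sized slot, interpreted according to the kind of method.
    union JniOrProfilingSlot {
      void* jni_entrypoint;      // native methods: the JNI stub
      ProfilingInfo* profiling;  // managed methods being profiled by the JIT
    };
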
diff --git a/runtime/debugger.cc b/runtime/debugger.cc
index b17b76e..7117be9 100644
--- a/runtime/debugger.cc
+++ b/runtime/debugger.cc
@@ -69,29 +69,26 @@
   return alloc_record_count;
 }
 
-class Breakpoint {
+class Breakpoint : public ValueObject {
  public:
-  Breakpoint(ArtMethod* method, uint32_t dex_pc,
-             DeoptimizationRequest::Kind deoptimization_kind)
-    SHARED_REQUIRES(Locks::mutator_lock_)
-    : method_(nullptr), dex_pc_(dex_pc), deoptimization_kind_(deoptimization_kind) {
+  Breakpoint(ArtMethod* method, uint32_t dex_pc, DeoptimizationRequest::Kind deoptimization_kind)
+    : method_(method),
+      dex_pc_(dex_pc),
+      deoptimization_kind_(deoptimization_kind) {
     CHECK(deoptimization_kind_ == DeoptimizationRequest::kNothing ||
           deoptimization_kind_ == DeoptimizationRequest::kSelectiveDeoptimization ||
           deoptimization_kind_ == DeoptimizationRequest::kFullDeoptimization);
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    method_ = soa.EncodeMethod(method);
   }
 
   Breakpoint(const Breakpoint& other) SHARED_REQUIRES(Locks::mutator_lock_)
-    : method_(nullptr), dex_pc_(other.dex_pc_),
-      deoptimization_kind_(other.deoptimization_kind_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    method_ = soa.EncodeMethod(other.Method());
-  }
+    : method_(other.method_),
+      dex_pc_(other.dex_pc_),
+      deoptimization_kind_(other.deoptimization_kind_) {}
 
-  ArtMethod* Method() const SHARED_REQUIRES(Locks::mutator_lock_) {
-    ScopedObjectAccessUnchecked soa(Thread::Current());
-    return soa.DecodeMethod(method_);
+  // Method() is called from root visiting; do not use ScopedObjectAccess here, or it can cause
+  // the GC to deadlock if another thread tries to call SuspendAll while the GC is in a runnable state.
+  ArtMethod* Method() const {
+    return method_;
   }
 
   uint32_t DexPc() const {
@@ -104,7 +101,7 @@
 
  private:
   // The location of this breakpoint.
-  jmethodID method_;
+  ArtMethod* method_;
   uint32_t dex_pc_;
 
   // Indicates whether breakpoint needs full deoptimization or selective deoptimization.
diff --git a/runtime/exception_test.cc b/runtime/exception_test.cc
index b1d4d35..18ccd08 100644
--- a/runtime/exception_test.cc
+++ b/runtime/exception_test.cc
@@ -92,10 +92,25 @@
     fake_header_code_and_maps_.insert(fake_header_code_and_maps_.end(),
                                       fake_code_.begin(), fake_code_.end());
 
-    // NOTE: Don't align the code (it will not be executed) but check that the Thumb2
-    // adjustment will be a NOP, see EntryPointToCodePointer().
-    CHECK_ALIGNED(mapping_table_offset, 2);
-    const uint8_t* code_ptr = &fake_header_code_and_maps_[gc_map_offset];
+    // Align the code.
+    const size_t alignment = GetInstructionSetAlignment(kRuntimeISA);
+    fake_header_code_and_maps_.reserve(fake_header_code_and_maps_.size() + alignment);
+    const void* unaligned_code_ptr =
+        fake_header_code_and_maps_.data() + (fake_header_code_and_maps_.size() - code_size);
+    size_t offset = dchecked_integral_cast<size_t>(reinterpret_cast<uintptr_t>(unaligned_code_ptr));
+    size_t padding = RoundUp(offset, alignment) - offset;
+    // Make sure no resizing takes place.
+    CHECK_GE(fake_header_code_and_maps_.capacity(), fake_header_code_and_maps_.size() + padding);
+    fake_header_code_and_maps_.insert(fake_header_code_and_maps_.begin(), padding, 0);
+    const void* code_ptr = reinterpret_cast<const uint8_t*>(unaligned_code_ptr) + padding;
+    CHECK_EQ(code_ptr,
+             static_cast<const void*>(fake_header_code_and_maps_.data() +
+                                          (fake_header_code_and_maps_.size() - code_size)));
+
+    if (kRuntimeISA == kArm) {
+      // Check that the Thumb2 adjustment will be a NOP, see EntryPointToCodePointer().
+      CHECK_ALIGNED(mapping_table_offset, 2);
+    }
 
     method_f_ = my_klass_->FindVirtualMethod("f", "()I", sizeof(void*));
     ASSERT_TRUE(method_f_ != nullptr);
diff --git a/runtime/jit/jit.h b/runtime/jit/jit.h
index e73ba82..1f89f9b 100644
--- a/runtime/jit/jit.h
+++ b/runtime/jit/jit.h
@@ -43,7 +43,7 @@
 class Jit {
  public:
   static constexpr bool kStressMode = kIsDebugBuild;
-  static constexpr size_t kDefaultCompileThreshold = kStressMode ? 2 : 1000;
+  static constexpr size_t kDefaultCompileThreshold = kStressMode ? 2 : 500;
   static constexpr size_t kDefaultWarmupThreshold = kDefaultCompileThreshold / 2;
 
   virtual ~Jit();
diff --git a/runtime/jit/jit_code_cache.cc b/runtime/jit/jit_code_cache.cc
index 2d0a2a5..60568b2 100644
--- a/runtime/jit/jit_code_cache.cc
+++ b/runtime/jit/jit_code_cache.cc
@@ -21,6 +21,7 @@
 #include "art_method-inl.h"
 #include "entrypoints/runtime_asm_entrypoints.h"
 #include "gc/accounting/bitmap-inl.h"
+#include "jit/profiling_info.h"
 #include "linear_alloc.h"
 #include "mem_map.h"
 #include "oat_file-inl.h"
@@ -56,9 +57,9 @@
     return nullptr;
   }
 
-  // Data cache is 1 / 4 of the map.
+  // Data cache is 1 / 2 of the map.
   // TODO: Make this variable?
-  size_t data_size = RoundUp(data_map->Size() / 4, kPageSize);
+  size_t data_size = RoundUp(data_map->Size() / 2, kPageSize);
   size_t code_size = data_map->Size() - data_size;
   uint8_t* divider = data_map->Begin() + data_size;
 
@@ -206,10 +207,23 @@
   // We do not check if a code cache GC is in progress, as this method comes
   // with the classlinker_classes_lock_ held, and suspending ourselves could
   // lead to a deadlock.
-  for (auto it = method_code_map_.begin(); it != method_code_map_.end();) {
-    if (alloc.ContainsUnsafe(it->second)) {
-      FreeCode(it->first, it->second);
-      it = method_code_map_.erase(it);
+  {
+    ScopedCodeCacheWrite scc(code_map_.get());
+    for (auto it = method_code_map_.begin(); it != method_code_map_.end();) {
+      if (alloc.ContainsUnsafe(it->second)) {
+        FreeCode(it->first, it->second);
+        it = method_code_map_.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  for (auto it = profiling_infos_.begin(); it != profiling_infos_.end();) {
+    ProfilingInfo* info = *it;
+    if (alloc.ContainsUnsafe(info->GetMethod())) {
+      info->GetMethod()->SetProfilingInfo(nullptr);
+      mspace_free(data_mspace_, reinterpret_cast<uint8_t*>(info));
+      it = profiling_infos_.erase(it);
     } else {
       ++it;
     }
@@ -387,6 +401,9 @@
     for (auto& it : method_code_map_) {
       it.second->SetEntryPointFromQuickCompiledCode(GetQuickToInterpreterBridge());
     }
+    for (ProfilingInfo* info : profiling_infos_) {
+      info->GetMethod()->SetProfilingInfo(nullptr);
+    }
   }
 
   // Run a checkpoint on all threads to mark the JIT compiled code they are running.
@@ -400,27 +417,37 @@
     }
   }
 
-  // Free unused compiled code, and restore the entry point of used compiled code.
   {
     MutexLock mu(self, lock_);
     DCHECK_EQ(map_size, method_code_map_.size());
-    ScopedCodeCacheWrite scc(code_map_.get());
-    for (auto it = method_code_map_.begin(); it != method_code_map_.end();) {
-      const void* code_ptr = it->first;
-      ArtMethod* method = it->second;
-      uintptr_t allocation = FromCodeToAllocation(code_ptr);
-      const OatQuickMethodHeader* method_header = OatQuickMethodHeader::FromCodePointer(code_ptr);
-      if (GetLiveBitmap()->Test(allocation)) {
-        method->SetEntryPointFromQuickCompiledCode(method_header->GetEntryPoint());
-        ++it;
-      } else {
-        method->ClearCounter();
-        DCHECK_NE(method->GetEntryPointFromQuickCompiledCode(), method_header->GetEntryPoint());
-        FreeCode(code_ptr, method);
-        it = method_code_map_.erase(it);
+    // Free unused compiled code, and restore the entry point of used compiled code.
+    {
+      ScopedCodeCacheWrite scc(code_map_.get());
+      for (auto it = method_code_map_.begin(); it != method_code_map_.end();) {
+        const void* code_ptr = it->first;
+        ArtMethod* method = it->second;
+        uintptr_t allocation = FromCodeToAllocation(code_ptr);
+        const OatQuickMethodHeader* method_header = OatQuickMethodHeader::FromCodePointer(code_ptr);
+        if (GetLiveBitmap()->Test(allocation)) {
+          method->SetEntryPointFromQuickCompiledCode(method_header->GetEntryPoint());
+          ++it;
+        } else {
+          method->ClearCounter();
+          DCHECK_NE(method->GetEntryPointFromQuickCompiledCode(), method_header->GetEntryPoint());
+          FreeCode(code_ptr, method);
+          it = method_code_map_.erase(it);
+        }
       }
     }
     GetLiveBitmap()->Bitmap::Clear();
+
+    // Free all profiling info.
+    for (ProfilingInfo* info : profiling_infos_) {
+      DCHECK(info->GetMethod()->GetProfilingInfo(sizeof(void*)) == nullptr);
+      mspace_free(data_mspace_, reinterpret_cast<uint8_t*>(info));
+    }
+    profiling_infos_.clear();
+
     collection_in_progress_ = false;
     lock_cond_.Broadcast(self);
   }
@@ -460,5 +487,44 @@
   return method_header;
 }
 
+ProfilingInfo* JitCodeCache::AddProfilingInfo(Thread* self,
+                                              ArtMethod* method,
+                                              const std::vector<uint32_t>& entries,
+                                              bool retry_allocation) {
+  ProfilingInfo* info = AddProfilingInfoInternal(self, method, entries);
+
+  if (info == nullptr && retry_allocation) {
+    GarbageCollectCache(self);
+    info = AddProfilingInfoInternal(self, method, entries);
+  }
+  return info;
+}
+
+ProfilingInfo* JitCodeCache::AddProfilingInfoInternal(Thread* self,
+                                                      ArtMethod* method,
+                                                      const std::vector<uint32_t>& entries) {
+  size_t profile_info_size = RoundUp(
+      sizeof(ProfilingInfo) + sizeof(ProfilingInfo::InlineCache) * entries.size(),
+      sizeof(void*));
+  ScopedThreadSuspension sts(self, kSuspended);
+  MutexLock mu(self, lock_);
+  WaitForPotentialCollectionToComplete(self);
+
+  // Check whether some other thread has concurrently created it.
+  ProfilingInfo* info = method->GetProfilingInfo(sizeof(void*));
+  if (info != nullptr) {
+    return info;
+  }
+
+  uint8_t* data = reinterpret_cast<uint8_t*>(mspace_malloc(data_mspace_, profile_info_size));
+  if (data == nullptr) {
+    return nullptr;
+  }
+  info = new (data) ProfilingInfo(method, entries);
+  method->SetProfilingInfo(info);
+  profiling_infos_.push_back(info);
+  return info;
+}
+
 }  // namespace jit
 }  // namespace art
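
AddProfilingInfoInternal uses a check-then-allocate pattern under the cache lock, so two threads racing to profile the same method end up sharing one object. Reduced to its shape as a generic sketch (not ART code):

    #include <mutex>

    template <typename T, typename Factory>
    T* GetOrCreate(std::mutex& lock, T*& slot, Factory make) {
      std::lock_guard<std::mutex> guard(lock);
      if (slot != nullptr) {
        return slot;  // another thread created it first
      }
      T* fresh = make();  // may return nullptr, e.g. on allocation failure
      if (fresh != nullptr) {
        slot = fresh;
      }
      return fresh;
    }
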
diff --git a/runtime/jit/jit_code_cache.h b/runtime/jit/jit_code_cache.h
index 4e415b8..e10f962 100644
--- a/runtime/jit/jit_code_cache.h
+++ b/runtime/jit/jit_code_cache.h
@@ -35,6 +35,7 @@
 
 class ArtMethod;
 class LinearAlloc;
+class ProfilingInfo;
 
 namespace jit {
 
@@ -109,11 +110,21 @@
       REQUIRES(!lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Remove all methods in our cache that were allocated by 'alloc'.
   void RemoveMethodsIn(Thread* self, const LinearAlloc& alloc)
       REQUIRES(!lock_)
       REQUIRES(Locks::classlinker_classes_lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  // Create a 'ProfilingInfo' for 'method'. If 'retry_allocation' is true, this
+  // will run a code cache collection and retry if the first allocation is unsuccessful.
+  ProfilingInfo* AddProfilingInfo(Thread* self,
+                                  ArtMethod* method,
+                                  const std::vector<uint32_t>& entries,
+                                  bool retry_allocation)
+      REQUIRES(!lock_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
  private:
   // Take ownership of code_mem_map.
   JitCodeCache(MemMap* code_map, MemMap* data_map);
@@ -133,6 +144,12 @@
       REQUIRES(!lock_)
       SHARED_REQUIRES(Locks::mutator_lock_);
 
+  ProfilingInfo* AddProfilingInfoInternal(Thread* self,
+                                          ArtMethod* method,
+                                          const std::vector<uint32_t>& entries)
+      REQUIRES(!lock_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
+
   // If a collection is in progress, wait for it to finish. Return
   // whether the thread actually waited.
   bool WaitForPotentialCollectionToComplete(Thread* self)
@@ -157,8 +174,10 @@
   void* data_mspace_ GUARDED_BY(lock_);
   // Bitmap for collecting code and data.
   std::unique_ptr<CodeCacheBitmap> live_bitmap_;
-  // This map holds compiled code associated to the ArtMethod
+  // This map holds compiled code associated to the ArtMethod.
   SafeMap<const void*, ArtMethod*> method_code_map_ GUARDED_BY(lock_);
+  // ProfilingInfo objects we have allocated.
+  std::vector<ProfilingInfo*> profiling_infos_ GUARDED_BY(lock_);
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCodeCache);
 };
diff --git a/runtime/jit/jit_instrumentation.cc b/runtime/jit/jit_instrumentation.cc
index 666b8e7..8aaa5fa 100644
--- a/runtime/jit/jit_instrumentation.cc
+++ b/runtime/jit/jit_instrumentation.cc
@@ -26,7 +26,12 @@
 
 class JitCompileTask FINAL : public Task {
  public:
-  explicit JitCompileTask(ArtMethod* method) : method_(method) {
+  enum TaskKind {
+    kAllocateProfile,
+    kCompile
+  };
+
+  JitCompileTask(ArtMethod* method, TaskKind kind) : method_(method), kind_(kind) {
     ScopedObjectAccess soa(Thread::Current());
     // Add a global ref to the class to prevent class unloading until compilation is done.
     klass_ = soa.Vm()->AddGlobalRef(soa.Self(), method_->GetDeclaringClass());
@@ -40,9 +45,16 @@
 
   void Run(Thread* self) OVERRIDE {
     ScopedObjectAccess soa(self);
-    VLOG(jit) << "JitCompileTask compiling method " << PrettyMethod(method_);
-    if (!Runtime::Current()->GetJit()->CompileMethod(method_, self)) {
-      VLOG(jit) << "Failed to compile method " << PrettyMethod(method_);
+    if (kind_ == kCompile) {
+      VLOG(jit) << "JitCompileTask compiling method " << PrettyMethod(method_);
+      if (!Runtime::Current()->GetJit()->CompileMethod(method_, self)) {
+        VLOG(jit) << "Failed to compile method " << PrettyMethod(method_);
+      }
+    } else {
+      DCHECK(kind_ == kAllocateProfile);
+      if (ProfilingInfo::Create(self, method_, /* retry_allocation */ true)) {
+        VLOG(jit) << "Start profiling " << PrettyMethod(method_);
+      }
     }
   }
 
@@ -52,6 +64,7 @@
 
  private:
   ArtMethod* const method_;
+  const TaskKind kind_;
   jobject klass_;
 
   DISALLOW_IMPLICIT_CONSTRUCTORS(JitCompileTask);
@@ -73,7 +86,6 @@
 }
 
 void JitInstrumentationCache::AddSamples(Thread* self, ArtMethod* method, size_t) {
-  ScopedObjectAccessUnchecked soa(self);
   // Since we don't have on-stack replacement, some methods can remain in the interpreter longer
   // than we want resulting in samples even after the method is compiled.
   if (method->IsClassInitializer() || method->IsNative()) {
@@ -85,14 +97,20 @@
   }
   uint16_t sample_count = method->IncrementCounter();
   if (sample_count == warm_method_threshold_) {
-    ProfilingInfo* info = method->CreateProfilingInfo();
-    if (info != nullptr) {
+    if (ProfilingInfo::Create(self, method, /* retry_allocation */ false)) {
       VLOG(jit) << "Start profiling " << PrettyMethod(method);
+    } else {
+      // We failed to allocate. Instead of doing the collection on the Java thread, we push
+      // an allocation task to a compiler thread, which will do the collection.
+      thread_pool_->AddTask(self, new JitCompileTask(
+          method->GetInterfaceMethodIfProxy(sizeof(void*)), JitCompileTask::kAllocateProfile));
+      thread_pool_->StartWorkers(self);
     }
   }
+
   if (sample_count == hot_method_threshold_) {
     thread_pool_->AddTask(self, new JitCompileTask(
-        method->GetInterfaceMethodIfProxy(sizeof(void*))));
+        method->GetInterfaceMethodIfProxy(sizeof(void*)), JitCompileTask::kCompile));
     thread_pool_->StartWorkers(self);
   }
 }
@@ -107,14 +125,18 @@
                                                           ArtMethod* caller,
                                                           uint32_t dex_pc,
                                                           ArtMethod* callee ATTRIBUTE_UNUSED) {
+  instrumentation_cache_->AddSamples(thread, caller, 1);
+  // We make sure we cannot be suspended, as the profiling info can be concurrently deleted.
+  thread->StartAssertNoThreadSuspension("Instrumenting invoke");
   DCHECK(this_object != nullptr);
   ProfilingInfo* info = caller->GetProfilingInfo(sizeof(void*));
   if (info != nullptr) {
     // Since the instrumentation is marked from the declaring class we need to mark the card so
     // that mod-union tables and card rescanning know about the update.
     Runtime::Current()->GetHeap()->WriteBarrierEveryFieldOf(caller->GetDeclaringClass());
-    info->AddInvokeInfo(thread, dex_pc, this_object->GetClass());
+    info->AddInvokeInfo(dex_pc, this_object->GetClass());
   }
+  thread->EndAssertNoThreadSuspension(nullptr);
 }
 
 void JitInstrumentationCache::WaitForCompilationToFinish(Thread* self) {
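
The Start/EndAssertNoThreadSuspension pair above keeps the thread uninterruptible while the ProfilingInfo is touched; a suspension point there would let a concurrent code cache collection free the info. An illustrative RAII wrapper for the same pairing (not something this change introduces):

    class ScopedAssertNoSuspension {
     public:
      ScopedAssertNoSuspension(Thread* self, const char* cause) : self_(self) {
        self_->StartAssertNoThreadSuspension(cause);
      }
      ~ScopedAssertNoSuspension() {
        self_->EndAssertNoThreadSuspension(nullptr);
      }
     private:
      Thread* const self_;
    };
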
diff --git a/runtime/jit/profiling_info.cc b/runtime/jit/profiling_info.cc
index 7c5f78e..2e52b1b 100644
--- a/runtime/jit/profiling_info.cc
+++ b/runtime/jit/profiling_info.cc
@@ -25,7 +25,7 @@
 
 namespace art {
 
-ProfilingInfo* ProfilingInfo::Create(ArtMethod* method) {
+bool ProfilingInfo::Create(Thread* self, ArtMethod* method, bool retry_allocation) {
   // Walk over the dex instructions of the method and keep track of
   // instructions we are interested in profiling.
   DCHECK(!method->IsNative());
@@ -57,23 +57,15 @@
   // If there is no instruction we are interested in, no need to create a `ProfilingInfo`
   // object, it will never be filled.
   if (entries.empty()) {
-    return nullptr;
+    return true;
   }
 
   // Allocate the `ProfilingInfo` object in the JIT's data space.
   jit::JitCodeCache* code_cache = Runtime::Current()->GetJit()->GetCodeCache();
-  size_t profile_info_size = sizeof(ProfilingInfo) + sizeof(InlineCache) * entries.size();
-  uint8_t* data = code_cache->ReserveData(Thread::Current(), profile_info_size);
-
-  if (data == nullptr) {
-    VLOG(jit) << "Cannot allocate profiling info anymore";
-    return nullptr;
-  }
-
-  return new (data) ProfilingInfo(entries);
+  return code_cache->AddProfilingInfo(self, method, entries, retry_allocation) != nullptr;
 }
 
-void ProfilingInfo::AddInvokeInfo(Thread* self, uint32_t dex_pc, mirror::Class* cls) {
+void ProfilingInfo::AddInvokeInfo(uint32_t dex_pc, mirror::Class* cls) {
   InlineCache* cache = nullptr;
   // TODO: binary search if array is too long.
   for (size_t i = 0; i < number_of_inline_caches_; ++i) {
@@ -84,7 +76,6 @@
   }
   DCHECK(cache != nullptr);
 
-  ScopedObjectAccess soa(self);
   for (size_t i = 0; i < InlineCache::kIndividualCacheSize; ++i) {
     mirror::Class* existing = cache->classes_[i].Read();
     if (existing == cls) {
diff --git a/runtime/jit/profiling_info.h b/runtime/jit/profiling_info.h
index 7a2d1a8..b13a315 100644
--- a/runtime/jit/profiling_info.h
+++ b/runtime/jit/profiling_info.h
@@ -26,6 +26,10 @@
 
 class ArtMethod;
 
+namespace jit {
+class JitCodeCache;
+}
+
 namespace mirror {
 class Class;
 }
@@ -36,10 +40,17 @@
  */
 class ProfilingInfo {
  public:
-  static ProfilingInfo* Create(ArtMethod* method) SHARED_REQUIRES(Locks::mutator_lock_);
+  // Create a ProfilingInfo for 'method'. Returns true if it succeeded, or if one is
+  // not needed because the method has no virtual/interface invocations.
+  static bool Create(Thread* self, ArtMethod* method, bool retry_allocation)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // Add information from an executed INVOKE instruction to the profile.
-  void AddInvokeInfo(Thread* self, uint32_t dex_pc, mirror::Class* cls);
+  void AddInvokeInfo(uint32_t dex_pc, mirror::Class* cls)
+      // Method should not be interruptible, as it manipulates the ProfilingInfo
+      // which can be concurrently collected.
+      REQUIRES(Roles::uninterruptible_)
+      SHARED_REQUIRES(Locks::mutator_lock_);
 
   // NO_THREAD_SAFETY_ANALYSIS since we don't know what the callback requires.
   template<typename RootVisitorType>
@@ -52,6 +63,10 @@
     }
   }
 
+  ArtMethod* GetMethod() const {
+    return method_;
+  }
+
  private:
   // Structure to store the classes seen at runtime for a specific instruction.
   // Once the classes_ array is full, we consider the INVOKE to be megamorphic.
@@ -84,8 +99,9 @@
     GcRoot<mirror::Class> classes_[kIndividualCacheSize];
   };
 
-  explicit ProfilingInfo(const std::vector<uint32_t>& entries)
-      : number_of_inline_caches_(entries.size()) {
+  ProfilingInfo(ArtMethod* method, const std::vector<uint32_t>& entries)
+      : number_of_inline_caches_(entries.size()),
+        method_(method) {
     memset(&cache_, 0, number_of_inline_caches_ * sizeof(InlineCache));
     for (size_t i = 0; i < number_of_inline_caches_; ++i) {
       cache_[i].dex_pc = entries[i];
@@ -95,9 +111,14 @@
   // Number of instructions we are profiling in the ArtMethod.
   const uint32_t number_of_inline_caches_;
 
+  // Method this profiling info is for.
+  ArtMethod* const method_;
+
   // Dynamically allocated array of size `number_of_inline_caches_`.
   InlineCache cache_[0];
 
+  friend class jit::JitCodeCache;
+
   DISALLOW_COPY_AND_ASSIGN(ProfilingInfo);
 };
 
diff --git a/runtime/oat_quick_method_header.h b/runtime/oat_quick_method_header.h
index c9a2cfb..03cad08 100644
--- a/runtime/oat_quick_method_header.h
+++ b/runtime/oat_quick_method_header.h
@@ -43,6 +43,8 @@
   static OatQuickMethodHeader* FromCodePointer(const void* code_ptr) {
     uintptr_t code = reinterpret_cast<uintptr_t>(code_ptr);
     uintptr_t header = code - OFFSETOF_MEMBER(OatQuickMethodHeader, code_);
+    DCHECK(IsAlignedParam(code, GetInstructionSetAlignment(kRuntimeISA)) ||
+           IsAlignedParam(header, GetInstructionSetAlignment(kRuntimeISA)));
     return reinterpret_cast<OatQuickMethodHeader*>(header);
   }
 
diff --git a/test/004-ThreadStress/run b/test/004-ThreadStress/run
new file mode 100755
index 0000000..27c501d
--- /dev/null
+++ b/test/004-ThreadStress/run
@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Be less aggressive than the default debug option for the jit code cache
+# to avoid timeouts.
+exec ${RUN} "$@" --runtime-option -Xjitcodecachesize:1M
diff --git a/test/etc/run-test-jar b/test/etc/run-test-jar
index 280b4bc..18867fd 100755
--- a/test/etc/run-test-jar
+++ b/test/etc/run-test-jar
@@ -214,7 +214,7 @@
 
 if [ "$USE_JVM" = "n" ]; then
     for feature in ${EXPERIMENTAL}; do
-        FLAGS="${FLAGS} -Xexperimental:${feature}"
+        FLAGS="${FLAGS} -Xexperimental:${feature} -Xcompiler-option --runtime-arg -Xcompiler-option -Xexperimental:${feature}"
         COMPILE_FLAGS="${COMPILE_FLAGS} --runtime-arg -Xexperimental:${feature}"
     done
 fi
diff --git a/test/run-all-tests b/test/run-all-tests
index 76283b7..6d5c28c 100755
--- a/test/run-all-tests
+++ b/test/run-all-tests
@@ -44,12 +44,45 @@
     elif [ "x$1" = "x--use-java-home" ]; then
         run_args="${run_args} --use-java-home"
         shift
+    elif [ "x$1" = "x--no-image" ]; then
+        run_args="${run_args} --no-image"
+        shift
+    elif [ "x$1" = "x--quick" ]; then
+        run_args="${run_args} --quick"
+        shift
+    elif [ "x$1" = "x--optimizing" ]; then
+        run_args="${run_args} --optimizing"
+        shift
+    elif [ "x$1" = "x--image" ]; then
+        run_args="${run_args} --image"
+        shift
+    elif [ "x$1" = "x--never-clean" ]; then
+        run_args="${run_args} --never-clean"
+        shift
     elif [ "x$1" = "x--jvm" ]; then
         run_args="${run_args} --jvm"
         shift
     elif [ "x$1" = "x--debug" ]; then
         run_args="${run_args} --debug"
         shift
+    elif [ "x$1" = "x--build-only" ]; then
+        run_args="${run_args} --build-only"
+        shift
+    elif [ "x$1" = "x--build-with-jack" ]; then
+        run_args="${run_args} --build-with-jack"
+        shift
+    elif [ "x$1" = "x--build-with-javac-dx" ]; then
+        run_args="${run_args} --build-with-javac-dx"
+        shift
+    elif [ "x$1" = "x--dex2oat-swap" ]; then
+        run_args="${run_args} --dex2oat-swap"
+        shift
+    elif [ "x$1" = "x--dalvik" ]; then
+        run_args="${run_args} --dalvik"
+        shift
+    elif [ "x$1" = "x--debuggable" ]; then
+        run_args="${run_args} --debuggable"
+        shift
     elif [ "x$1" = "x--zygote" ]; then
         run_args="${run_args} --zygote"
         shift
@@ -59,15 +92,15 @@
     elif [ "x$1" = "x--jit" ]; then
         run_args="${run_args} --jit"
         shift
+    elif [ "x$1" = "x--verify-soft-fail" ]; then
+        run_args="${run_args} --verify-soft-fail"
+        shift
     elif [ "x$1" = "x--no-verify" ]; then
         run_args="${run_args} --no-verify"
         shift
     elif [ "x$1" = "x--no-optimize" ]; then
         run_args="${run_args} --no-optimize"
         shift
-    elif [ "x$1" = "x--valgrind" ]; then
-        run_args="${run_args} --valgrind"
-        shift
     elif [ "x$1" = "x--dev" ]; then
         run_args="${run_args} --dev"
         shift
@@ -116,6 +149,15 @@
     elif [ "x$1" = "x--always-clean" ]; then
         run_args="${run_args} --always-clean"
         shift
+    elif [ "x$1" = "x--pic-test" ]; then
+        run_args="${run_args} --pic-test"
+        shift
+    elif [ "x$1" = "x--pic-image" ]; then
+        run_args="${run_args} --pic-image"
+        shift
+    elif [ "x$1" = "x--strace" ]; then
+        run_args="${run_args} --strace"
+        shift
     elif expr "x$1" : "x--" >/dev/null 2>&1; then
         echo "unknown $0 option: $1" 1>&2
         usage="yes"
@@ -134,9 +176,13 @@
         echo "  Options are all passed to run-test; refer to that for " \
              "further documentation:"
         echo "    --debug --dev --host --interpreter --jit --jvm --no-optimize"
-        echo "    --no-verify -O --update --valgrind --zygote --64 --relocate"
-        echo "    --prebuild --always-clean --gcstress --gcverify --trace"
-        echo "    --no-patchoat --no-dex2oat --use-java-home"
+        echo "    --no-verify --verify-soft-fail -O --update --zygote --64"
+        echo "    --relocate --prebuild --always-clean --gcstress --gcverify"
+        echo "    --trace --no-patchoat --no-dex2oat --use-java-home --pic-image"
+        echo "    --pic-test --strace --debuggable --dalvik --dex2oat-swap"
+        echo "    --build-only --build-with-jack --build-with-javac-dx"
+        echo "    --never-clean --image --no-image --quick --optimizing"
+        echo "    --no-relocate --no-prebuild"
         echo "  Specific Runtime Options:"
         echo "    --seq                Run tests one-by-one, avoiding failures caused by busy CPU"
     ) 1>&2
diff --git a/test/run-test b/test/run-test
index 5a43fb0..3442fcf 100755
--- a/test/run-test
+++ b/test/run-test
@@ -528,6 +528,7 @@
         echo "    --debug               Wait for a debugger to attach."
         echo "    --debuggable          Whether to compile Java code for a debugger."
         echo "    --gdb                 Run under gdb; incompatible with some tests."
+        echo "    --gdb-arg             Pass an option to gdb."
         echo "    --build-only          Build test files only (off by default)."
         echo "    --build-with-javac-dx Build test files with javac and dx (on by default)."
         echo "    --build-with-jack     Build test files with jack and jill (off by default)."
@@ -553,6 +554,8 @@
         echo "                          the image and oat files be relocated to a random"
         echo "                          address before running. (default)"
         echo "    --no-relocate         Force the use of no relocating in the test"
+        echo "    --image               Run the test using a precompiled boot image. (default)"
+        echo "    --no-image            Run the test without a precompiled boot image."
         echo "    --host                Use the host-mode virtual machine."
         echo "    --invoke-with         Pass --invoke-with option to runtime."
         echo "    --dalvik              Use Dalvik (off by default)."
@@ -564,6 +567,7 @@
              "files."
         echo "    --64                  Run the test in 64-bit mode"
         echo "    --trace               Run with method tracing"
+        echo "    --strace              Run with syscall tracing from strace."
         echo "    --stream              Run method tracing in streaming mode (requires --trace)"
         echo "    --gcstress            Run with gc stress testing"
         echo "    --gcverify            Run with gc verification"
@@ -573,6 +577,9 @@
         echo "    --dex2oat-swap        Use a dex2oat swap file."
         echo "    --instruction-set-features [string]"
         echo "                          Set instruction-set-features for compilation."
+        echo "    --pic-image           Use an image compiled with position independent code for the"
+        echo "                          boot class path."
+        echo "    --pic-test            Compile the test code position independent."
     ) 1>&2
     exit 1
 fi
diff --git a/tools/run-jdwp-tests.sh b/tools/run-jdwp-tests.sh
index 9aed271..de27a6f 100755
--- a/tools/run-jdwp-tests.sh
+++ b/tools/run-jdwp-tests.sh
@@ -30,8 +30,6 @@
 
 art="/data/local/tmp/system/bin/art"
 art_debugee="sh /data/local/tmp/system/bin/art"
-# We use Quick's image on target because optimizing's image is not compiled debuggable.
-image="-Ximage:/data/art-test/core.art"
 args=$@
 debuggee_args="-Xcompiler-option --debuggable"
 device_dir="--device-dir=/data/local/tmp"
@@ -41,6 +39,8 @@
 image_compiler_option=""
 debug="no"
 verbose="no"
+image="-Ximage:/data/art-test/core-jit.art"
+vm_args=""
 # By default, we run the whole JDWP test suite.
 test="org.apache.harmony.jpda.tests.share.AllTests"
 
@@ -88,7 +88,10 @@
   fi
 done
 
-vm_args="--vm-arg $image --vm-arg -Xusejit:true"
+if [[ "$image" != "" ]]; then
+  vm_args="--vm-arg $image"
+fi
+vm_args="$vm_args --vm-arg -Xusejit:true"
 debuggee_args="$debuggee_args -Xusejit:true"
 if [[ $debug == "yes" ]]; then
   art="$art -d"