Merge "ART: Refactor libarttest C++ code"
diff --git a/compiler/dex/quick/codegen_util.cc b/compiler/dex/quick/codegen_util.cc
index 7082bed..d5ac341 100644
--- a/compiler/dex/quick/codegen_util.cc
+++ b/compiler/dex/quick/codegen_util.cc
@@ -1126,7 +1126,7 @@
     for (size_t i = 0 ; i < core_vmap_table_.size(); ++i) {
       // Copy, stripping out the phys register sort key.
       vmap_encoder.PushBackUnsigned(
-          ~(-1 << VREG_NUM_WIDTH) & (core_vmap_table_[i] + VmapTable::kEntryAdjustment));
+          ~(~0u << VREG_NUM_WIDTH) & (core_vmap_table_[i] + VmapTable::kEntryAdjustment));
     }
     // Push a marker to take place of lr.
     vmap_encoder.PushBackUnsigned(VmapTable::kAdjustedFpMarker);
@@ -1141,7 +1141,7 @@
       for (size_t i = 0 ; i < fp_vmap_table_.size(); ++i) {
         // Copy, stripping out the phys register sort key.
         vmap_encoder.PushBackUnsigned(
-            ~(-1 << VREG_NUM_WIDTH) & (fp_vmap_table_[i] + VmapTable::kEntryAdjustment));
+            ~(~0u << VREG_NUM_WIDTH) & (fp_vmap_table_[i] + VmapTable::kEntryAdjustment));
       }
     }
   } else {
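
Note on the mask fix above: left-shifting a negative signed value (`-1 << VREG_NUM_WIDTH`) is undefined behavior in C++, while the replacement performs the same low-bit mask computation entirely in unsigned arithmetic. A minimal standalone sketch (the constant 16 is an assumed stand-in for ART's VREG_NUM_WIDTH):

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t kVRegNumWidth = 16;  // hypothetical value for illustration

    int main() {
      // ~(-1 << n) shifts a negative signed int: undefined behavior.
      // ~(~0u << n) builds the same low-n-bit mask in well-defined unsigned math.
      uint32_t mask = ~(~0u << kVRegNumWidth);
      assert(mask == 0xFFFFu);  // low 16 bits set
      return 0;
    }
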
diff --git a/compiler/optimizing/bounds_check_elimination.cc b/compiler/optimizing/bounds_check_elimination.cc
index 42b3541..960f4d9 100644
--- a/compiler/optimizing/bounds_check_elimination.cc
+++ b/compiler/optimizing/bounds_check_elimination.cc
@@ -410,7 +410,7 @@
  * of an existing value range, NewArray or a loop phi corresponding to an
  * incrementing/decrementing array index (MonotonicValueRange).
  */
-class ValueRange : public ArenaObject<kArenaAllocMisc> {
+class ValueRange : public ArenaObject<kArenaAllocBoundsCheckElimination> {
  public:
   ValueRange(ArenaAllocator* allocator, ValueBound lower, ValueBound upper)
       : allocator_(allocator), lower_(lower), upper_(upper) {}
@@ -1112,7 +1112,14 @@
 
   BCEVisitor(HGraph* graph, HInductionVarAnalysis* induction_analysis)
       : HGraphVisitor(graph),
-        maps_(graph->GetBlocks().size()),
+        maps_(graph->GetBlocks().size(),
+              ArenaSafeMap<int, ValueRange*>(
+                  std::less<int>(),
+                  graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)),
+              graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)),
+        first_constant_index_bounds_check_map_(
+            std::less<int>(),
+            graph->GetArena()->Adapter(kArenaAllocBoundsCheckElimination)),
         need_to_revisit_block_(false),
         initial_block_size_(graph->GetBlocks().size()),
         induction_range_(induction_analysis) {}
@@ -1137,14 +1144,9 @@
       // Added blocks don't keep value ranges.
       return nullptr;
     }
-    int block_id = basic_block->GetBlockId();
-    if (maps_.at(block_id) == nullptr) {
-      std::unique_ptr<ArenaSafeMap<int, ValueRange*>> map(
-          new ArenaSafeMap<int, ValueRange*>(
-              std::less<int>(), GetGraph()->GetArena()->Adapter()));
-      maps_.at(block_id) = std::move(map);
-    }
-    return maps_.at(block_id).get();
+    uint32_t block_id = basic_block->GetBlockId();
+    DCHECK_LT(block_id, maps_.size());
+    return &maps_[block_id];
   }
 
   // Traverse up the dominator tree to look for value range info.
@@ -1842,11 +1844,11 @@
     }
   }
 
-  std::vector<std::unique_ptr<ArenaSafeMap<int, ValueRange*>>> maps_;
+  ArenaVector<ArenaSafeMap<int, ValueRange*>> maps_;
 
   // Map an HArrayLength instruction's id to the first HBoundsCheck instruction in
   // a block that checks a constant index against that HArrayLength.
-  SafeMap<int, HBoundsCheck*> first_constant_index_bounds_check_map_;
+  ArenaSafeMap<int, HBoundsCheck*> first_constant_index_bounds_check_map_;
 
   // For the block, there is at least one HArrayLength instruction for which there
   // is more than one bounds check instruction with constant indexing. And it's
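
The maps_ change above replaces lazily heap-allocated per-block maps with an ArenaVector of eagerly constructed ArenaSafeMaps, all tagged with kArenaAllocBoundsCheckElimination. ART's arena containers are std-style containers over a bump allocator; a rough standard-library analogue of the same construction pattern (a sketch using C++17 std::pmr, not ART's API):

    #include <cassert>
    #include <cstddef>
    #include <map>
    #include <memory_resource>
    #include <vector>

    int main() {
      // Arena-style bump allocator: all allocations released in one shot.
      std::pmr::monotonic_buffer_resource arena;

      using RangeMap = std::pmr::map<int, int>;  // stand-in for ArenaSafeMap<int, ValueRange*>

      const std::size_t num_blocks = 8;  // stand-in for graph->GetBlocks().size()
      // One empty map per block, each backed by the arena, mirroring
      // maps_(size, prototype, adapter) in the BCEVisitor constructor.
      std::pmr::vector<RangeMap> maps(num_blocks, RangeMap(&arena), &arena);

      maps[3].emplace(42, 7);
      assert(maps[3].at(42) == 7);
      return 0;
    }
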
diff --git a/compiler/optimizing/builder.cc b/compiler/optimizing/builder.cc
index 7ae405a..cb36f62 100644
--- a/compiler/optimizing/builder.cc
+++ b/compiler/optimizing/builder.cc
@@ -324,7 +324,7 @@
   // Keep a map of all try blocks and their respective TryItems. We do not use
   // the block's pointer but rather its id to ensure deterministic iteration.
   ArenaSafeMap<uint32_t, const DexFile::TryItem*> try_block_info(
-      std::less<uint32_t>(), arena_->Adapter());
+      std::less<uint32_t>(), arena_->Adapter(kArenaAllocGraphBuilder));
 
   // Obtain TryItem information for blocks with throwing instructions, and split
   // blocks which are both try & catch to simplify the graph.
diff --git a/compiler/optimizing/code_generator.cc b/compiler/optimizing/code_generator.cc
index 3c6a41d..be05691 100644
--- a/compiler/optimizing/code_generator.cc
+++ b/compiler/optimizing/code_generator.cc
@@ -203,13 +203,13 @@
 
 void CodeGenerator::GenerateSlowPaths() {
   size_t code_start = 0;
-  for (size_t i = 0, e = slow_paths_.Size(); i < e; ++i) {
+  for (SlowPathCode* slow_path : slow_paths_) {
     if (disasm_info_ != nullptr) {
       code_start = GetAssembler()->CodeSize();
     }
-    slow_paths_.Get(i)->EmitNativeCode(this);
+    slow_path->EmitNativeCode(this);
     if (disasm_info_ != nullptr) {
-      disasm_info_->AddSlowPathInterval(slow_paths_.Get(i), code_start, GetAssembler()->CodeSize());
+      disasm_info_->AddSlowPathInterval(slow_path, code_start, GetAssembler()->CodeSize());
     }
   }
 }
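
The rewrite of GenerateSlowPaths also shows the interface shift: GrowableArray's Size()/Get(i) pair becomes std-style size()/operator[] on ArenaVector, which permits range-based for. A toy sketch of the loop shape (the SlowPathCode body here is invented for illustration):

    #include <cstdio>
    #include <vector>

    struct SlowPathCode {
      int id;
      void EmitNativeCode() const { std::printf("slow path %d\n", id); }
    };

    int main() {
      std::vector<SlowPathCode*> slow_paths;  // stand-in for ArenaVector<SlowPathCode*>
      SlowPathCode a{0}, b{1};
      slow_paths.push_back(&a);
      slow_paths.push_back(&b);
      // Range-based for replaces the old Size()/Get(i) indexing loop.
      for (SlowPathCode* slow_path : slow_paths) {
        slow_path->EmitNativeCode();
      }
      return 0;
    }
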
diff --git a/compiler/optimizing/code_generator.h b/compiler/optimizing/code_generator.h
index a1c6db0..5da0e59 100644
--- a/compiler/optimizing/code_generator.h
+++ b/compiler/optimizing/code_generator.h
@@ -261,7 +261,7 @@
   bool IsImplicitNullCheckAllowed(HNullCheck* null_check) const;
 
   void AddSlowPath(SlowPathCode* slow_path) {
-    slow_paths_.Add(slow_path);
+    slow_paths_.push_back(slow_path);
   }
 
   void SetSrcMap(DefaultSrcMap* src_map) { src_map_ = src_map; }
@@ -425,9 +425,12 @@
         core_spill_mask_(0),
         fpu_spill_mask_(0),
         first_register_slot_in_slow_path_(0),
-        blocked_core_registers_(graph->GetArena()->AllocArray<bool>(number_of_core_registers)),
-        blocked_fpu_registers_(graph->GetArena()->AllocArray<bool>(number_of_fpu_registers)),
-        blocked_register_pairs_(graph->GetArena()->AllocArray<bool>(number_of_register_pairs)),
+        blocked_core_registers_(graph->GetArena()->AllocArray<bool>(number_of_core_registers,
+                                                                    kArenaAllocCodeGenerator)),
+        blocked_fpu_registers_(graph->GetArena()->AllocArray<bool>(number_of_fpu_registers,
+                                                                   kArenaAllocCodeGenerator)),
+        blocked_register_pairs_(graph->GetArena()->AllocArray<bool>(number_of_register_pairs,
+                                                                    kArenaAllocCodeGenerator)),
         number_of_core_registers_(number_of_core_registers),
         number_of_fpu_registers_(number_of_fpu_registers),
         number_of_register_pairs_(number_of_register_pairs),
@@ -441,10 +444,12 @@
         graph_(graph),
         compiler_options_(compiler_options),
         src_map_(nullptr),
-        slow_paths_(graph->GetArena(), 8),
+        slow_paths_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
         current_block_index_(0),
         is_leaf_(true),
-        requires_current_method_(false) {}
+        requires_current_method_(false) {
+    slow_paths_.reserve(8);
+  }
 
   // Register allocation logic.
   void AllocateRegistersLocally(HInstruction* instruction) const;
@@ -485,8 +490,20 @@
     return instruction_set == kX86 || instruction_set == kX86_64;
   }
 
-  // Arm64 has its own type for a label, so we need to templatize this method
+  // Arm64 has its own type for a label, so we need to templatize these methods
   // to share the logic.
+
+  template <typename LabelType>
+  LabelType* CommonInitializeLabels() {
+    size_t size = GetGraph()->GetBlocks().size();
+    LabelType* labels = GetGraph()->GetArena()->AllocArray<LabelType>(size,
+                                                                      kArenaAllocCodeGenerator);
+    for (size_t i = 0; i != size; ++i) {
+      new(labels + i) LabelType();
+    }
+    return labels;
+  }
+
   template <typename LabelType>
   LabelType* CommonGetLabelOf(LabelType* raw_pointer_to_labels_array, HBasicBlock* block) const {
     block = FirstNonEmptyBlock(block);
@@ -539,7 +556,7 @@
 
   // Native to dex_pc map used for native debugging/profiling tools.
   DefaultSrcMap* src_map_;
-  GrowableArray<SlowPathCode*> slow_paths_;
+  ArenaVector<SlowPathCode*> slow_paths_;
 
   // The current block index in `block_order_` of the block
   // we are generating code for.
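
CommonInitializeLabels above captures a recurring arena idiom: allocate raw (zeroed) storage for N objects, then placement-new construct each element; since arena memory is released in bulk, no destructors run and none are needed for trivially destructible labels. A standalone sketch of the pattern with malloc standing in for the ArenaAllocator:

    #include <cstdlib>
    #include <new>

    struct Label {
      bool bound = false;  // toy stand-in for assembler label state
    };

    int main() {
      std::size_t size = 4;  // stand-in for GetGraph()->GetBlocks().size()
      Label* labels = static_cast<Label*>(std::malloc(size * sizeof(Label)));
      for (std::size_t i = 0; i != size; ++i) {
        new (labels + i) Label();  // construct in place, indexed later by block id
      }
      // ... use labels[block_id] ...
      std::free(labels);  // arena analogue: one bulk release, no per-element destructors
      return 0;
    }
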
diff --git a/compiler/optimizing/code_generator_arm.cc b/compiler/optimizing/code_generator_arm.cc
index d7b1d24..c775e03 100644
--- a/compiler/optimizing/code_generator_arm.cc
+++ b/compiler/optimizing/code_generator_arm.cc
@@ -428,19 +428,25 @@
                     kNumberOfRegisterPairs,
                     ComputeRegisterMask(reinterpret_cast<const int*>(kCoreCalleeSaves),
                                         arraysize(kCoreCalleeSaves)),
-                    ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
-                                        arraysize(kFpuCalleeSaves)),
+                    graph->IsDebuggable()
+                        // If the graph is debuggable, we need to save the fpu registers ourselves,
+                        // as the stubs do not do it.
+                        ? 0
+                        : ComputeRegisterMask(reinterpret_cast<const int*>(kFpuCalleeSaves),
+                                              arraysize(kFpuCalleeSaves)),
                     compiler_options,
                     stats),
-      block_labels_(graph->GetArena(), 0),
+      block_labels_(nullptr),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
       assembler_(),
       isa_features_(isa_features),
-      method_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter()),
-      call_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter()),
-      relative_call_patches_(graph->GetArena()->Adapter()) {
+      method_patches_(MethodReferenceComparator(),
+                      graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      call_patches_(MethodReferenceComparator(),
+                    graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Always save the LR register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(LR));
 }
@@ -459,8 +465,8 @@
   for (HBasicBlock* block : *block_order_) {
     // Get the label directly from block_labels_ rather than through GetLabelOf() to avoid
     // FirstNonEmptyBlock() which could lead to adjusting a label more than once.
-    DCHECK_LT(static_cast<size_t>(block->GetBlockId()), block_labels_.Size());
-    Label* block_label = &block_labels_.GetRawStorage()[block->GetBlockId()];
+    DCHECK_LT(block->GetBlockId(), GetGraph()->GetBlocks().size());
+    Label* block_label = &block_labels_[block->GetBlockId()];
     DCHECK_EQ(block_label->IsBound(), !block->IsSingleJump());
     if (block_label->IsBound()) {
       __ AdjustLabelPosition(block_label);
@@ -4034,7 +4040,8 @@
 }
 
 void ParallelMoveResolverARM::EmitMove(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   Location source = move->GetSource();
   Location destination = move->GetDestination();
 
@@ -4166,7 +4173,8 @@
 }
 
 void ParallelMoveResolverARM::EmitSwap(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   Location source = move->GetSource();
   Location destination = move->GetDestination();
 
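
The constructor change above zeroes the FPU callee-save mask for debuggable graphs, so the generated code spills those registers itself instead of trusting the runtime stubs to preserve them. A tiny sketch of the mask selection (register numbers are hypothetical):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Build a bitmask from register numbers, as ComputeRegisterMask does.
    uint32_t ComputeMask(const int* regs, std::size_t count) {
      uint32_t mask = 0;
      for (std::size_t i = 0; i < count; ++i) {
        mask |= 1u << regs[i];
      }
      return mask;
    }

    int main() {
      const int kFpuCalleeSaves[] = {16, 17, 18, 19};  // hypothetical numbers
      bool is_debuggable = true;
      // Debuggable graphs claim no FPU callee-saves; the codegen saves them itself.
      uint32_t fpu_mask = is_debuggable ? 0u : ComputeMask(kFpuCalleeSaves, 4);
      assert(fpu_mask == 0u);
      return 0;
    }
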
diff --git a/compiler/optimizing/code_generator_arm.h b/compiler/optimizing/code_generator_arm.h
index 1d98789..111112e 100644
--- a/compiler/optimizing/code_generator_arm.h
+++ b/compiler/optimizing/code_generator_arm.h
@@ -299,11 +299,11 @@
   void MarkGCCard(Register temp, Register card, Register object, Register value, bool can_be_null);
 
   Label* GetLabelOf(HBasicBlock* block) const {
-    return CommonGetLabelOf<Label>(block_labels_.GetRawStorage(), block);
+    return CommonGetLabelOf<Label>(block_labels_, block);
   }
 
   void Initialize() OVERRIDE {
-    block_labels_.SetSize(GetGraph()->GetBlocks().size());
+    block_labels_ = CommonInitializeLabels<Label>();
   }
 
   void Finalize(CodeAllocator* allocator) OVERRIDE;
@@ -335,7 +335,7 @@
   Literal* DeduplicateMethodCodeLiteral(MethodReference target_method);
 
   // Labels for each block that will be compiled.
-  GrowableArray<Label> block_labels_;
+  Label* block_labels_;  // Indexed by block id.
   Label frame_entry_label_;
   LocationsBuilderARM location_builder_;
   InstructionCodeGeneratorARM instruction_visitor_;
diff --git a/compiler/optimizing/code_generator_arm64.cc b/compiler/optimizing/code_generator_arm64.cc
index d175532..70327af 100644
--- a/compiler/optimizing/code_generator_arm64.cc
+++ b/compiler/optimizing/code_generator_arm64.cc
@@ -119,8 +119,11 @@
 
   CPURegList core_list = CPURegList(CPURegister::kRegister, kXRegSize,
       register_set->GetCoreRegisters() & (~callee_saved_core_registers.list()));
-  CPURegList fp_list = CPURegList(CPURegister::kFPRegister, kDRegSize,
-      register_set->GetFloatingPointRegisters() & (~callee_saved_fp_registers.list()));
+  CPURegList fp_list = CPURegList(
+      CPURegister::kFPRegister,
+      kDRegSize,
+      register_set->GetFloatingPointRegisters()
+          & (~(codegen->GetGraph()->IsDebuggable() ? 0 : callee_saved_fp_registers.list())));
 
   MacroAssembler* masm = down_cast<CodeGeneratorARM64*>(codegen)->GetVIXLAssembler();
   UseScratchRegisterScope temps(masm);
@@ -534,7 +537,9 @@
                     kNumberOfAllocatableFPRegisters,
                     kNumberOfAllocatableRegisterPairs,
                     callee_saved_core_registers.list(),
-                    callee_saved_fp_registers.list(),
+                    // If the graph is debuggable, we need to save the fpu registers ourselves,
+                    // as the stubs do not do it.
+                    graph->IsDebuggable() ? 0 : callee_saved_fp_registers.list(),
                     compiler_options,
                     stats),
       block_labels_(nullptr),
@@ -542,11 +547,14 @@
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
       isa_features_(isa_features),
-      uint64_literals_(std::less<uint64_t>(), graph->GetArena()->Adapter()),
-      method_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter()),
-      call_patches_(MethodReferenceComparator(), graph->GetArena()->Adapter()),
-      relative_call_patches_(graph->GetArena()->Adapter()),
-      pc_rel_dex_cache_patches_(graph->GetArena()->Adapter()) {
+      uint64_literals_(std::less<uint64_t>(),
+                       graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      method_patches_(MethodReferenceComparator(),
+                      graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      call_patches_(MethodReferenceComparator(),
+                    graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      pc_rel_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Save the link register (containing the return address) to mimic Quick.
   AddAllocatedRegister(LocationFrom(lr));
 }
@@ -610,7 +618,8 @@
 }
 
 void ParallelMoveResolverARM64::EmitMove(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   codegen_->MoveLocation(move->GetDestination(), move->GetSource());
 }
 
diff --git a/compiler/optimizing/code_generator_arm64.h b/compiler/optimizing/code_generator_arm64.h
index 8967108..7178081 100644
--- a/compiler/optimizing/code_generator_arm64.h
+++ b/compiler/optimizing/code_generator_arm64.h
@@ -326,12 +326,7 @@
   }
 
   void Initialize() OVERRIDE {
-    HGraph* graph = GetGraph();
-    int length = graph->GetBlocks().size();
-    block_labels_ = graph->GetArena()->AllocArray<vixl::Label>(length);
-    for (int i = 0; i < length; ++i) {
-      new(block_labels_ + i) vixl::Label();
-    }
+    block_labels_ = CommonInitializeLabels<vixl::Label>();
   }
 
   void Finalize(CodeAllocator* allocator) OVERRIDE;
@@ -400,7 +395,7 @@
   };
 
   // Labels for each block that will be compiled.
-  vixl::Label* block_labels_;
+  vixl::Label* block_labels_;  // Indexed by block id.
   vixl::Label frame_entry_label_;
 
   LocationsBuilderARM64 location_builder_;
diff --git a/compiler/optimizing/code_generator_mips64.cc b/compiler/optimizing/code_generator_mips64.cc
index 25ef388..c9f8493 100644
--- a/compiler/optimizing/code_generator_mips64.cc
+++ b/compiler/optimizing/code_generator_mips64.cc
@@ -431,7 +431,7 @@
                                         arraysize(kFpuCalleeSaves)),
                     compiler_options,
                     stats),
-      block_labels_(graph->GetArena(), 0),
+      block_labels_(nullptr),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
@@ -453,12 +453,14 @@
 }
 
 void ParallelMoveResolverMIPS64::EmitMove(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   codegen_->MoveLocation(move->GetDestination(), move->GetSource(), move->GetType());
 }
 
 void ParallelMoveResolverMIPS64::EmitSwap(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   codegen_->SwapLocations(move->GetDestination(), move->GetSource(), move->GetType());
 }
 
diff --git a/compiler/optimizing/code_generator_mips64.h b/compiler/optimizing/code_generator_mips64.h
index f66ecb3..16461d6 100644
--- a/compiler/optimizing/code_generator_mips64.h
+++ b/compiler/optimizing/code_generator_mips64.h
@@ -270,11 +270,11 @@
   }
 
   Label* GetLabelOf(HBasicBlock* block) const {
-    return CommonGetLabelOf<Label>(block_labels_.GetRawStorage(), block);
+    return CommonGetLabelOf<Label>(block_labels_, block);
   }
 
   void Initialize() OVERRIDE {
-    block_labels_.SetSize(GetGraph()->GetBlocks().size());
+    block_labels_ = CommonInitializeLabels<Label>();
   }
 
   void Finalize(CodeAllocator* allocator) OVERRIDE;
@@ -315,7 +315,7 @@
 
  private:
   // Labels for each block that will be compiled.
-  GrowableArray<Label> block_labels_;
+  Label* block_labels_;  // Indexed by block id.
   Label frame_entry_label_;
   LocationsBuilderMIPS64 location_builder_;
   InstructionCodeGeneratorMIPS64 instruction_visitor_;
diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index ab3d1d1..a47a95e 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -470,13 +470,13 @@
                     0,
                     compiler_options,
                     stats),
-      block_labels_(graph->GetArena(), 0),
+      block_labels_(nullptr),
       location_builder_(graph, this),
       instruction_visitor_(graph, this),
       move_resolver_(graph->GetArena(), this),
       isa_features_(isa_features),
-      method_patches_(graph->GetArena()->Adapter()),
-      relative_call_patches_(graph->GetArena()->Adapter()) {
+      method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+      relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   // Use a fake return address register to mimic Quick.
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
@@ -4630,7 +4630,8 @@
 }
 
 void ParallelMoveResolverX86::EmitMove(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   Location source = move->GetSource();
   Location destination = move->GetDestination();
 
@@ -4782,7 +4783,8 @@
 }
 
 void ParallelMoveResolverX86::EmitSwap(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   Location source = move->GetSource();
   Location destination = move->GetDestination();
 
@@ -5621,7 +5623,7 @@
 /**
  * Class to handle late fixup of offsets into constant area.
  */
-class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocMisc> {
+class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocCodeGenerator> {
  public:
   RIPFixup(const CodeGeneratorX86& codegen, int offset)
       : codegen_(codegen), offset_into_constant_area_(offset) {}
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index f38e1ea..2c2fc65 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -316,11 +316,11 @@
                   bool value_can_be_null);
 
   Label* GetLabelOf(HBasicBlock* block) const {
-    return CommonGetLabelOf<Label>(block_labels_.GetRawStorage(), block);
+    return CommonGetLabelOf<Label>(block_labels_, block);
   }
 
   void Initialize() OVERRIDE {
-    block_labels_.SetSize(GetGraph()->GetBlocks().size());
+    block_labels_ = CommonInitializeLabels<Label>();
   }
 
   bool NeedsTwoRegisters(Primitive::Type type) const OVERRIDE {
@@ -356,7 +356,7 @@
 
  private:
   // Labels for each block that will be compiled.
-  GrowableArray<Label> block_labels_;
+  Label* block_labels_;  // Indexed by block id.
   Label frame_entry_label_;
   LocationsBuilderX86 location_builder_;
   InstructionCodeGeneratorX86 instruction_visitor_;
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index cfce7a0..b845a27 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -620,15 +620,15 @@
                                           arraysize(kFpuCalleeSaves)),
                       compiler_options,
                       stats),
-        block_labels_(graph->GetArena(), 0),
+        block_labels_(nullptr),
         location_builder_(graph, this),
         instruction_visitor_(graph, this),
         move_resolver_(graph->GetArena(), this),
         isa_features_(isa_features),
         constant_area_start_(0),
-        method_patches_(graph->GetArena()->Adapter()),
-        relative_call_patches_(graph->GetArena()->Adapter()),
-        pc_rel_dex_cache_patches_(graph->GetArena()->Adapter()) {
+        method_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        relative_call_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)),
+        pc_rel_dex_cache_patches_(graph->GetArena()->Adapter(kArenaAllocCodeGenerator)) {
   AddAllocatedRegister(Location::RegisterLocation(kFakeReturnRegister));
 }
 
@@ -4373,7 +4373,8 @@
 }
 
 void ParallelMoveResolverX86_64::EmitMove(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   Location source = move->GetSource();
   Location destination = move->GetDestination();
 
@@ -4531,7 +4532,8 @@
 }
 
 void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   Location source = move->GetSource();
   Location destination = move->GetDestination();
 
@@ -5277,7 +5279,7 @@
 /**
  * Class to handle late fixup of offsets into constant area.
  */
-class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocMisc> {
+class RIPFixup : public AssemblerFixup, public ArenaObject<kArenaAllocCodeGenerator> {
   public:
     RIPFixup(const CodeGeneratorX86_64& codegen, int offset)
       : codegen_(codegen), offset_into_constant_area_(offset) {}
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index 1ec3580..197ce63 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -286,11 +286,11 @@
   void Move(Location destination, Location source);
 
   Label* GetLabelOf(HBasicBlock* block) const {
-    return CommonGetLabelOf<Label>(block_labels_.GetRawStorage(), block);
+    return CommonGetLabelOf<Label>(block_labels_, block);
   }
 
   void Initialize() OVERRIDE {
-    block_labels_.SetSize(GetGraph()->GetBlocks().size());
+    block_labels_ = CommonInitializeLabels<Label>();
   }
 
   bool NeedsTwoRegisters(Primitive::Type type ATTRIBUTE_UNUSED) const OVERRIDE {
@@ -334,7 +334,7 @@
   };
 
   // Labels for each block that will be compiled.
-  GrowableArray<Label> block_labels_;
+  Label* block_labels_;  // Indexed by block id.
   Label frame_entry_label_;
   LocationsBuilderX86_64 location_builder_;
   InstructionCodeGeneratorX86_64 instruction_visitor_;
diff --git a/compiler/optimizing/constant_folding.cc b/compiler/optimizing/constant_folding.cc
index 20ce110..e0aa4ff 100644
--- a/compiler/optimizing/constant_folding.cc
+++ b/compiler/optimizing/constant_folding.cc
@@ -226,14 +226,14 @@
 
   // We assume that GVN has run before, so we only perform a pointer
   // comparison.  If for some reason the values are equal but the pointers are
-  // different, we are still correct and only miss an optimisation
+  // different, we are still correct and only miss an optimization
   // opportunity.
   if (instruction->GetLeft() == instruction->GetRight()) {
     // Replace code looking like
     //    SUB dst, src, src
     // with
     //    CONSTANT 0
-    // Note that we cannot optimise `x - x` to `0` for floating-point. It does
+    // Note that we cannot optimize `x - x` to `0` for floating-point. It does
     // not work when `x` is an infinity.
     instruction->ReplaceWith(GetGraph()->GetConstant(type, 0));
     block->RemoveInstruction(instruction);
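
The comment fix above also names the reason the `x - x` fold is integral-only: with IEEE floating point, `inf - inf` (and `NaN - NaN`) is NaN, not 0. A quick check:

    #include <cassert>
    #include <cmath>
    #include <limits>

    int main() {
      double inf = std::numeric_limits<double>::infinity();
      assert(std::isnan(inf - inf));  // floating point: x - x is not always 0
      int n = 12345;
      assert(n - n == 0);             // integral: the fold is always valid
      return 0;
    }
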
diff --git a/compiler/optimizing/graph_visualizer.cc b/compiler/optimizing/graph_visualizer.cc
index d05c514..2c6c3b7 100644
--- a/compiler/optimizing/graph_visualizer.cc
+++ b/compiler/optimizing/graph_visualizer.cc
@@ -374,6 +374,11 @@
         << instance_of->MustDoNullCheck() << std::noboolalpha;
   }
 
+  void VisitArraySet(HArraySet* array_set) OVERRIDE {
+    StartAttributeStream("value_can_be_null") << std::boolalpha
+        << array_set->GetValueCanBeNull() << std::noboolalpha;
+  }
+
   void VisitInvoke(HInvoke* invoke) OVERRIDE {
     StartAttributeStream("dex_file_index") << invoke->GetDexMethodIndex();
     StartAttributeStream("method_name") << PrettyMethod(
diff --git a/compiler/optimizing/gvn.cc b/compiler/optimizing/gvn.cc
index 5050e15..7cf0617 100644
--- a/compiler/optimizing/gvn.cc
+++ b/compiler/optimizing/gvn.cc
@@ -39,7 +39,7 @@
   explicit ValueSet(ArenaAllocator* allocator)
       : allocator_(allocator),
         num_buckets_(kMinimumNumberOfBuckets),
-        buckets_(allocator->AllocArray<Node*>(num_buckets_)),
+        buckets_(allocator->AllocArray<Node*>(num_buckets_, kArenaAllocGvn)),
         buckets_owned_(allocator, num_buckets_, false),
         num_entries_(0) {
     // ArenaAllocator returns zeroed memory, so no need to set buckets to null.
@@ -52,7 +52,7 @@
   ValueSet(ArenaAllocator* allocator, const ValueSet& to_copy)
       : allocator_(allocator),
         num_buckets_(to_copy.IdealBucketCount()),
-        buckets_(allocator->AllocArray<Node*>(num_buckets_)),
+        buckets_(allocator->AllocArray<Node*>(num_buckets_, kArenaAllocGvn)),
         buckets_owned_(allocator, num_buckets_, false),
         num_entries_(to_copy.num_entries_) {
     // ArenaAllocator returns zeroed memory, so entries of buckets_ and
diff --git a/compiler/optimizing/induction_var_analysis.cc b/compiler/optimizing/induction_var_analysis.cc
index 9fb4304..e5123de 100644
--- a/compiler/optimizing/induction_var_analysis.cc
+++ b/compiler/optimizing/induction_var_analysis.cc
@@ -74,11 +74,14 @@
 HInductionVarAnalysis::HInductionVarAnalysis(HGraph* graph)
     : HOptimization(graph, kInductionPassName),
       global_depth_(0),
-      stack_(graph->GetArena()->Adapter()),
-      scc_(graph->GetArena()->Adapter()),
-      map_(std::less<HInstruction*>(), graph->GetArena()->Adapter()),
-      cycle_(std::less<HInstruction*>(), graph->GetArena()->Adapter()),
-      induction_(std::less<HLoopInformation*>(), graph->GetArena()->Adapter()) {
+      stack_(graph->GetArena()->Adapter(kArenaAllocInductionVarAnalysis)),
+      scc_(graph->GetArena()->Adapter(kArenaAllocInductionVarAnalysis)),
+      map_(std::less<HInstruction*>(),
+           graph->GetArena()->Adapter(kArenaAllocInductionVarAnalysis)),
+      cycle_(std::less<HInstruction*>(),
+             graph->GetArena()->Adapter(kArenaAllocInductionVarAnalysis)),
+      induction_(std::less<HLoopInformation*>(),
+                 graph->GetArena()->Adapter(kArenaAllocInductionVarAnalysis)) {
 }
 
 void HInductionVarAnalysis::Run() {
@@ -228,7 +231,7 @@
 
   // Rotate proper entry-phi to front.
   if (size > 1) {
-    ArenaVector<HInstruction*> other(graph_->GetArena()->Adapter());
+    ArenaVector<HInstruction*> other(graph_->GetArena()->Adapter(kArenaAllocInductionVarAnalysis));
     RotateEntryPhiFirst(loop, &scc_, &other);
   }
 
@@ -553,44 +556,33 @@
     }
   } else if (a->induction_class == kLinear && b->induction_class == kInvariant) {
     // Analyze condition with induction at left-hand-side (e.g. i < U).
+    InductionInfo* lower_expr = a->op_b;
+    InductionInfo* upper_expr = b;
     InductionInfo* stride = a->op_a;
-    InductionInfo* lo_val = a->op_b;
-    InductionInfo* hi_val = b;
-    // Analyze stride (may be compound).
-    InductionVarRange::Value v1 = InductionVarRange::GetVal(stride, nullptr, /* is_min */ true);
-    InductionVarRange::Value v2 = InductionVarRange::GetVal(stride, nullptr, /* is_min */ false);
-    if (v1.a_constant != 0 || v2.a_constant != 0 || v1.b_constant != v2.b_constant) {
+    int64_t stride_value = 0;
+    if (!IsIntAndGet(stride, &stride_value)) {
       return;
     }
-    // Rewrite safe condition i != U with unit stride into i < U or i > U
-    // (unit stride guarantees that the end condition is always reached).
-    const int32_t stride_value = v1.b_constant;
-    int64_t lo_value = 0;
-    int64_t hi_value = 0;
-    if (cmp == kCondNE && IsIntAndGet(lo_val, &lo_value) && IsIntAndGet(hi_val, &hi_value)) {
-      if ((stride_value == +1 && lo_value < hi_value) ||
-          (stride_value == -1 && lo_value > hi_value)) {
-        cmp = stride_value > 0 ? kCondLT : kCondGT;
-      }
+    // Rewrite condition i != U into i < U or i > U if end condition is reached exactly.
+    if (cmp == kCondNE && ((stride_value == +1 && IsTaken(lower_expr, upper_expr, kCondLT)) ||
+                           (stride_value == -1 && IsTaken(lower_expr, upper_expr, kCondGT)))) {
+      cmp = stride_value > 0 ? kCondLT : kCondGT;
     }
     // Normalize a linear loop control with a nonzero stride:
     //   stride > 0, either i < U or i <= U
     //   stride < 0, either i > U or i >= U
-    //
-    // TODO: construct conditions for constant/symbolic safety of trip-count
-    //
     if ((stride_value > 0 && (cmp == kCondLT || cmp == kCondLE)) ||
         (stride_value < 0 && (cmp == kCondGT || cmp == kCondGE))) {
-      VisitTripCount(loop, lo_val, hi_val, stride, stride_value, type, cmp);
+      VisitTripCount(loop, lower_expr, upper_expr, stride, stride_value, type, cmp);
     }
   }
 }
 
 void HInductionVarAnalysis::VisitTripCount(HLoopInformation* loop,
-                                           InductionInfo* lo_val,
-                                           InductionInfo* hi_val,
+                                           InductionInfo* lower_expr,
+                                           InductionInfo* upper_expr,
                                            InductionInfo* stride,
-                                           int32_t stride_value,
+                                           int64_t stride_value,
                                            Primitive::Type type,
                                            IfCondition cmp) {
   // Any loop of the general form:
@@ -604,30 +596,95 @@
   //    for (n = 0; n < TC; n++) // where TC = (U + S - L) / S
   //      .. L + S * n ..
   //
-  // NOTE: The TC (trip-count) expression is only valid when safe. Otherwise TC is 0
-  //       (or possibly infinite). Also, the expression assumes the loop does not have
-  //       early-exits. Otherwise, TC is an upper bound.
+  // taking the following into consideration:
   //
-  bool cancels = (cmp == kCondLT || cmp == kCondGT) && std::abs(stride_value) == 1;
+  // (1) Using the same precision, the TC (trip-count) expression should be interpreted as
+  //     an unsigned entity, for example, as in the following loop that uses the full range:
+  //     for (int i = INT_MIN; i < INT_MAX; i++) // TC = UINT_MAX
+  // (2) The TC is only valid if the loop is taken, otherwise TC = 0, as in:
+  //     for (int i = 12; i < U; i++) // TC = 0 when U <= 12
+  //     If this cannot be determined at compile-time, the TC is only valid within the
+  //     loop-body proper, not the loop-header unless enforced with an explicit condition.
+  // (3) The TC is only valid if the loop is finite, otherwise TC has no value, as in:
+  //     for (int i = 0; i <= U; i++) // TC = Inf when U = INT_MAX
+  //     If this cannot be determined at compile-time, the TC is only valid when enforced
+  //     with an explicit condition.
+  // (4) For loops which early-exit, the TC forms an upper bound, as in:
+  //     for (int i = 0; i < 10 && ....; i++) // TC <= 10
+  const bool is_taken = IsTaken(lower_expr, upper_expr, cmp);
+  const bool is_finite = IsFinite(upper_expr, stride_value, type, cmp);
+  const bool cancels = (cmp == kCondLT || cmp == kCondGT) && std::abs(stride_value) == 1;
   if (!cancels) {
     // Convert exclusive integral inequality into inclusive integral inequality,
     // viz. condition i < U is i <= U - 1 and condition i > U is i >= U + 1.
     if (cmp == kCondLT) {
-      hi_val = CreateInvariantOp(kSub, hi_val, CreateConstant(1, type));
+      upper_expr = CreateInvariantOp(kSub, upper_expr, CreateConstant(1, type));
     } else if (cmp == kCondGT) {
-      hi_val = CreateInvariantOp(kAdd, hi_val, CreateConstant(1, type));
+      upper_expr = CreateInvariantOp(kAdd, upper_expr, CreateConstant(1, type));
     }
     // Compensate for stride.
-    hi_val = CreateInvariantOp(kAdd, hi_val, stride);
+    upper_expr = CreateInvariantOp(kAdd, upper_expr, stride);
   }
-
+  InductionInfo* trip_count =
+      CreateInvariantOp(kDiv, CreateInvariantOp(kSub, upper_expr, lower_expr), stride);
   // Assign the trip-count expression to the loop control. Clients that use the information
-  // should be aware that the expression is only valid in the loop-body proper (when symbolically
-  // safe), and not yet in the loop-header (unless constant safe). If the loop has any early exits,
-  // the trip-count forms a conservative upper bound on the number of loop iterations.
-  InductionInfo* trip_count =
-      CreateInvariantOp(kDiv, CreateInvariantOp(kSub, hi_val, lo_val), stride);
-  AssignInfo(loop, loop->GetHeader()->GetLastInstruction(), trip_count);
+  // should be aware that the expression is only valid under the conditions listed above.
+  InductionOp tc_kind = kTripCountInBodyUnsafe;
+  if (is_taken && is_finite) {
+    tc_kind = kTripCountInLoop;
+  } else if (is_finite) {
+    tc_kind = kTripCountInBody;
+  } else if (is_taken) {
+    tc_kind = kTripCountInLoopUnsafe;
+  }
+  AssignInfo(loop, loop->GetHeader()->GetLastInstruction(), CreateTripCount(tc_kind, trip_count));
+}
+
+bool HInductionVarAnalysis::IsTaken(InductionInfo* lower_expr,
+                                    InductionInfo* upper_expr,
+                                    IfCondition cmp) {
+  int64_t lower_value;
+  int64_t upper_value;
+  if (IsIntAndGet(lower_expr, &lower_value) && IsIntAndGet(upper_expr, &upper_value)) {
+    switch (cmp) {
+      case kCondLT: return lower_value <  upper_value;
+      case kCondLE: return lower_value <= upper_value;
+      case kCondGT: return lower_value >  upper_value;
+      case kCondGE: return lower_value >= upper_value;
+      case kCondEQ:
+      case kCondNE: LOG(FATAL) << "CONDITION UNREACHABLE";
+    }
+  }
+  return false;  // not certain, may be untaken
+}
+
+bool HInductionVarAnalysis::IsFinite(InductionInfo* upper_expr,
+                                     int64_t stride_value,
+                                     Primitive::Type type,
+                                     IfCondition cmp) {
+  const int64_t min = type == Primitive::kPrimInt
+      ? std::numeric_limits<int32_t>::min()
+      : std::numeric_limits<int64_t>::min();
+  const int64_t max = type == Primitive::kPrimInt
+      ? std::numeric_limits<int32_t>::max()
+      : std::numeric_limits<int64_t>::max();
+  // Some rules under which it is certain at compile-time that the loop is finite.
+  int64_t value;
+  switch (cmp) {
+    case kCondLT:
+      return stride_value == 1 ||
+          (IsIntAndGet(upper_expr, &value) && value <= (max - stride_value + 1));
+    case kCondLE:
+      return (IsIntAndGet(upper_expr, &value) && value <= (max - stride_value));
+    case kCondGT:
+      return stride_value == -1 ||
+          (IsIntAndGet(upper_expr, &value) && value >= (min - stride_value - 1));
+    case kCondGE:
+      return (IsIntAndGet(upper_expr, &value) && value >= (min - stride_value));
+    case kCondEQ:
+    case kCondNE: LOG(FATAL) << "CONDITION UNREACHABLE";
+  }
+  return false;  // not certain, may be infinite
 }
 
 void HInductionVarAnalysis::AssignInfo(HLoopInformation* loop,
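
A worked instance of the trip-count normalization above: for `for (int i = L; i < U; i += S)` with S > 0, the exclusive test `i < U` becomes the inclusive `i <= U - 1`, compensating for the stride adds S, and TC = ((U - 1 + S) - L) / S, i.e. the ceiling of (U - L) / S. With |S| == 1 the -1 and +S cancel, which is the `cancels` shortcut. A check under assumed values:

    #include <cassert>

    int main() {
      // for (int i = 2; i < 11; i += 3) runs for i = 2, 5, 8: three iterations.
      int L = 2, U = 11, S = 3;
      int tc = ((U - 1 + S) - L) / S;  // = (13 - 2) / 3 = 3
      assert(tc == 3);
      // Unit stride: the adjustment cancels and TC = U - L.
      assert((U - L) / 1 == 9);  // for (int i = 2; i < 11; i++) runs 9 times
      return 0;
    }
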
@@ -637,7 +694,8 @@
   if (it == induction_.end()) {
     it = induction_.Put(loop,
                         ArenaSafeMap<HInstruction*, InductionInfo*>(
-                            std::less<HInstruction*>(), graph_->GetArena()->Adapter()));
+                            std::less<HInstruction*>(),
+                            graph_->GetArena()->Adapter(kArenaAllocInductionVarAnalysis)));
   }
   it->second.Put(instruction, info);
 }
@@ -744,13 +802,22 @@
 }
 
 bool HInductionVarAnalysis::IsIntAndGet(InductionInfo* info, int64_t* value) {
-  if (info != nullptr && info->induction_class == kInvariant && info->operation == kFetch) {
-    DCHECK(info->fetch);
-    if (info->fetch->IsIntConstant()) {
-      *value = info->fetch->AsIntConstant()->GetValue();
-      return true;
-    } else if (info->fetch->IsLongConstant()) {
-      *value = info->fetch->AsLongConstant()->GetValue();
+  if (info != nullptr && info->induction_class == kInvariant) {
+    // A direct constant fetch.
+    if (info->operation == kFetch) {
+      DCHECK(info->fetch);
+      if (info->fetch->IsIntConstant()) {
+        *value = info->fetch->AsIntConstant()->GetValue();
+        return true;
+      } else if (info->fetch->IsLongConstant()) {
+        *value = info->fetch->AsLongConstant()->GetValue();
+        return true;
+      }
+    }
+    // Use range analysis to resolve compound values.
+    int32_t range_value;
+    if (InductionVarRange::GetConstant(info, &range_value)) {
+      *value = range_value;
       return true;
     }
   }
@@ -778,6 +845,10 @@
             inv += std::to_string(info->fetch->GetId()) + ":" + info->fetch->DebugName();
           }
           break;
+        case kTripCountInLoop:       inv += "TC-loop:"; break;
+        case kTripCountInBody:       inv += "TC-body:"; break;
+        case kTripCountInLoopUnsafe: inv += "TC-loop-unsafe:"; break;
+        case kTripCountInBodyUnsafe: inv += "TC-body-unsafe:"; break;
       }
       inv += InductionToString(info->op_b);
       return inv + ")";
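
The bounds in IsFinite above keep the induction variable from wrapping before the exit test fires. For `i <= U` with stride S > 0, the largest value that passes the test is U, so the following increment computes up to U + S, which must stay representable: U <= max - S, exactly the `value <= max - stride_value` check. `i < U` relaxes this by one (the last passing value is U - 1), and a unit stride is always safe. A check in wider arithmetic:

    #include <cassert>
    #include <cstdint>
    #include <limits>

    int main() {
      const int64_t max = std::numeric_limits<int32_t>::max();
      int64_t S = 2;
      int64_t U_ok = max - S;       // accepted by the kCondLE rule
      int64_t U_bad = max - S + 1;  // rejected: the last increment would wrap
      assert(U_ok + S <= max);
      assert(U_bad + S > max);
      return 0;
    }
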
diff --git a/compiler/optimizing/induction_var_analysis.h b/compiler/optimizing/induction_var_analysis.h
index 190a0db..7ab80cd 100644
--- a/compiler/optimizing/induction_var_analysis.h
+++ b/compiler/optimizing/induction_var_analysis.h
@@ -56,13 +56,20 @@
   };
 
   enum InductionOp {
-    kNop,  // no-operation: a true induction
+    // No-operation: a true induction.
+    kNop,
+    // Various invariant operations.
     kAdd,
     kSub,
     kNeg,
     kMul,
     kDiv,
-    kFetch
+    kFetch,
+    // Trip counts (valid in full loop or only body proper; unsafe implies loop may be infinite).
+    kTripCountInLoop,
+    kTripCountInBody,
+    kTripCountInLoopUnsafe,
+    kTripCountInBodyUnsafe
   };
 
   /**
@@ -77,8 +84,10 @@
    *         nop: a, then defined by b
    *   (4) periodic
    *         nop: a, then defined by b (repeated when exhausted)
+   *   (5) trip-count:
+   *         tc: defined by b
    */
-  struct InductionInfo : public ArenaObject<kArenaAllocMisc> {
+  struct InductionInfo : public ArenaObject<kArenaAllocInductionVarAnalysis> {
     InductionInfo(InductionClass ic,
                   InductionOp op,
                   InductionInfo* a,
@@ -110,6 +119,10 @@
     return new (graph_->GetArena()) InductionInfo(kInvariant, kFetch, nullptr, nullptr, f);
   }
 
+  InductionInfo* CreateTripCount(InductionOp op, InductionInfo* b) {
+    return new (graph_->GetArena()) InductionInfo(kInvariant, op, nullptr, b, nullptr);
+  }
+
   InductionInfo* CreateInduction(InductionClass ic, InductionInfo* a, InductionInfo* b) {
     DCHECK(a != nullptr && b != nullptr);
     return new (graph_->GetArena()) InductionInfo(ic, kNop, a, b, nullptr);
@@ -151,12 +164,17 @@
                       Primitive::Type type,
                       IfCondition cmp);
   void VisitTripCount(HLoopInformation* loop,
-                      InductionInfo* lo_val,
-                      InductionInfo* hi_val,
+                      InductionInfo* lower_expr,
+                      InductionInfo* upper_expr,
                       InductionInfo* stride,
-                      int32_t stride_value,
+                      int64_t stride_value,
                       Primitive::Type type,
                       IfCondition cmp);
+  bool IsTaken(InductionInfo* lower_expr, InductionInfo* upper_expr, IfCondition cmp);
+  bool IsFinite(InductionInfo* upper_expr,
+                int64_t stride_value,
+                Primitive::Type type,
+                IfCondition cmp);
 
   // Assign and lookup.
   void AssignInfo(HLoopInformation* loop, HInstruction* instruction, InductionInfo* info);
diff --git a/compiler/optimizing/induction_var_analysis_test.cc b/compiler/optimizing/induction_var_analysis_test.cc
index e519e77..20492e7 100644
--- a/compiler/optimizing/induction_var_analysis_test.cc
+++ b/compiler/optimizing/induction_var_analysis_test.cc
@@ -234,7 +234,8 @@
   EXPECT_STREQ("((1) * i + (1))", GetInductionInfo(increment_[0], 0).c_str());
 
   // Trip-count.
-  EXPECT_STREQ("(100)", GetInductionInfo(loop_header_[0]->GetLastInstruction(), 0).c_str());
+  EXPECT_STREQ("(TC-loop:(100))",
+               GetInductionInfo(loop_header_[0]->GetLastInstruction(), 0).c_str());
 }
 
 TEST_F(InductionVarAnalysisTest, FindDerivedInduction) {
@@ -543,8 +544,10 @@
   InductionVarRange range(iva_);
   InductionVarRange::Value v_min = range.GetMinInduction(store, store->InputAt(1));
   InductionVarRange::Value v_max = range.GetMaxInduction(store, store->InputAt(1));
+  ASSERT_TRUE(v_min.is_known);
   EXPECT_EQ(0, v_min.a_constant);
   EXPECT_EQ(1, v_min.b_constant);
+  ASSERT_TRUE(v_max.is_known);
   EXPECT_EQ(0, v_max.a_constant);
   EXPECT_EQ(199, v_max.b_constant);
 }
@@ -579,7 +582,8 @@
     }
     EXPECT_STREQ("((1) * i + (1))", GetInductionInfo(increment_[d], d).c_str());
     // Trip-count.
-    EXPECT_STREQ("(100)", GetInductionInfo(loop_header_[d]->GetLastInstruction(), d).c_str());
+    EXPECT_STREQ("(TC-loop:(100))",
+                 GetInductionInfo(loop_header_[d]->GetLastInstruction(), d).c_str());
   }
 }
 
diff --git a/compiler/optimizing/induction_var_range.cc b/compiler/optimizing/induction_var_range.cc
index 119a80b..db12819 100644
--- a/compiler/optimizing/induction_var_range.cc
+++ b/compiler/optimizing/induction_var_range.cc
@@ -86,51 +86,36 @@
 
 InductionVarRange::Value InductionVarRange::GetMinInduction(HInstruction* context,
                                                             HInstruction* instruction) {
-  HLoopInformation* loop = context->GetBlock()->GetLoopInformation();
-  if (loop != nullptr) {
-    return GetVal(induction_analysis_->LookupInfo(loop, instruction),
-                  GetTripCount(loop, context), /* is_min */ true);
-  }
-  return Value();
+  return GetInduction(context, instruction, /* is_min */ true);
 }
 
 InductionVarRange::Value InductionVarRange::GetMaxInduction(HInstruction* context,
                                                             HInstruction* instruction) {
-  HLoopInformation* loop = context->GetBlock()->GetLoopInformation();
-  if (loop != nullptr) {
-    return SimplifyMax(
-        GetVal(induction_analysis_->LookupInfo(loop, instruction),
-               GetTripCount(loop, context), /* is_min */ false));
-  }
-  return Value();
+  return SimplifyMax(GetInduction(context, instruction, /* is_min */ false));
 }
 
 //
 // Private class methods.
 //
 
-HInductionVarAnalysis::InductionInfo* InductionVarRange::GetTripCount(HLoopInformation* loop,
-                                                                      HInstruction* context) {
-  // The trip-count expression is only valid when the top-test is taken at least once,
-  // that means, when the analyzed context appears outside the loop header itself.
-  // Early-exit loops are okay, since in those cases, the trip-count is conservative.
-  //
-  // TODO: deal with runtime safety issues on TCs
-  //
-  if (context->GetBlock() != loop->GetHeader()) {
-    HInductionVarAnalysis::InductionInfo* trip =
-        induction_analysis_->LookupInfo(loop, loop->GetHeader()->GetLastInstruction());
-    if (trip != nullptr) {
-      // Wrap the trip-count representation in its own unusual NOP node, so that range analysis
-      // is able to determine the [0, TC - 1] interval without having to construct constants.
-      return induction_analysis_->CreateInvariantOp(HInductionVarAnalysis::kNop, trip, trip);
-    }
+InductionVarRange::Value InductionVarRange::GetInduction(HInstruction* context,
+                                                         HInstruction* instruction,
+                                                         bool is_min) {
+  HLoopInformation* loop = context->GetBlock()->GetLoopInformation();  // closest enveloping loop
+  if (loop != nullptr) {
+    HBasicBlock* header = loop->GetHeader();
+    bool in_body = context->GetBlock() != header;
+    return GetVal(induction_analysis_->LookupInfo(loop, instruction),
+                  induction_analysis_->LookupInfo(loop, header->GetLastInstruction()),
+                  in_body,
+                  is_min);
   }
-  return nullptr;
+  return Value();
 }
 
 InductionVarRange::Value InductionVarRange::GetFetch(HInstruction* instruction,
                                                      HInductionVarAnalysis::InductionInfo* trip,
+                                                     bool in_body,
                                                      bool is_min) {
   // Detect constants and chase the fetch a bit deeper into the HIR tree, so that it becomes
   // more likely range analysis will compare the same instructions as terminal nodes.
@@ -139,13 +124,13 @@
     return Value(value);
   } else if (instruction->IsAdd()) {
     if (IsIntAndGet(instruction->InputAt(0), &value)) {
-      return AddValue(Value(value), GetFetch(instruction->InputAt(1), trip, is_min));
+      return AddValue(Value(value), GetFetch(instruction->InputAt(1), trip, in_body, is_min));
     } else if (IsIntAndGet(instruction->InputAt(1), &value)) {
-      return AddValue(GetFetch(instruction->InputAt(0), trip, is_min), Value(value));
+      return AddValue(GetFetch(instruction->InputAt(0), trip, in_body, is_min), Value(value));
     }
   } else if (is_min) {
-    // Special case for finding minimum: minimum of trip-count is 1.
-    if (trip != nullptr && instruction == trip->op_b->fetch) {
+    // Special case for finding minimum: minimum of trip-count in loop-body is 1.
+    if (trip != nullptr && in_body && instruction == trip->op_b->fetch) {
       return Value(1);
     }
   }
@@ -154,42 +139,53 @@
 
 InductionVarRange::Value InductionVarRange::GetVal(HInductionVarAnalysis::InductionInfo* info,
                                                    HInductionVarAnalysis::InductionInfo* trip,
+                                                   bool in_body,
                                                    bool is_min) {
   if (info != nullptr) {
     switch (info->induction_class) {
       case HInductionVarAnalysis::kInvariant:
         // Invariants.
         switch (info->operation) {
-          case HInductionVarAnalysis::kNop:  // normalized: 0 or TC-1
-            DCHECK_EQ(info->op_a, info->op_b);
-            return is_min ? Value(0)
-                          : SubValue(GetVal(info->op_b, trip, is_min), Value(1));
           case HInductionVarAnalysis::kAdd:
-            return AddValue(GetVal(info->op_a, trip, is_min),
-                            GetVal(info->op_b, trip, is_min));
+            return AddValue(GetVal(info->op_a, trip, in_body, is_min),
+                            GetVal(info->op_b, trip, in_body, is_min));
           case HInductionVarAnalysis::kSub:  // second reversed!
-            return SubValue(GetVal(info->op_a, trip, is_min),
-                            GetVal(info->op_b, trip, !is_min));
+            return SubValue(GetVal(info->op_a, trip, in_body, is_min),
+                            GetVal(info->op_b, trip, in_body, !is_min));
           case HInductionVarAnalysis::kNeg:  // second reversed!
             return SubValue(Value(0),
-                            GetVal(info->op_b, trip, !is_min));
+                            GetVal(info->op_b, trip, in_body, !is_min));
           case HInductionVarAnalysis::kMul:
-            return GetMul(info->op_a, info->op_b, trip, is_min);
+            return GetMul(info->op_a, info->op_b, trip, in_body, is_min);
           case HInductionVarAnalysis::kDiv:
-            return GetDiv(info->op_a, info->op_b, trip, is_min);
+            return GetDiv(info->op_a, info->op_b, trip, in_body, is_min);
           case HInductionVarAnalysis::kFetch:
-            return GetFetch(info->fetch, trip, is_min);
+            return GetFetch(info->fetch, trip, in_body, is_min);
+          case HInductionVarAnalysis::kTripCountInLoop:
+            if (!in_body) {
+              return is_min ? Value(0)
+                            : GetVal(info->op_b, trip, in_body, is_min);   // one extra!
+            }
+            FALLTHROUGH_INTENDED;
+          case HInductionVarAnalysis::kTripCountInBody:
+            if (in_body) {
+              return is_min ? Value(0)
+                            : SubValue(GetVal(info->op_b, trip, in_body, is_min), Value(1));
+            }
+            break;
+          default:
+            break;
         }
         break;
       case HInductionVarAnalysis::kLinear:
         // Linear induction a * i + b, for normalized 0 <= i < TC.
-        return AddValue(GetMul(info->op_a, trip, trip, is_min),
-                        GetVal(info->op_b, trip, is_min));
+        return AddValue(GetMul(info->op_a, trip, trip, in_body, is_min),
+                        GetVal(info->op_b, trip, in_body, is_min));
       case HInductionVarAnalysis::kWrapAround:
       case HInductionVarAnalysis::kPeriodic:
         // Merge values in the wrap-around/periodic.
-        return MergeVal(GetVal(info->op_a, trip, is_min),
-                        GetVal(info->op_b, trip, is_min), is_min);
+        return MergeVal(GetVal(info->op_a, trip, in_body, is_min),
+                        GetVal(info->op_b, trip, in_body, is_min), is_min);
     }
   }
   return Value();
@@ -198,11 +194,12 @@
 InductionVarRange::Value InductionVarRange::GetMul(HInductionVarAnalysis::InductionInfo* info1,
                                                    HInductionVarAnalysis::InductionInfo* info2,
                                                    HInductionVarAnalysis::InductionInfo* trip,
+                                                   bool in_body,
                                                    bool is_min) {
-  Value v1_min = GetVal(info1, trip, /* is_min */ true);
-  Value v1_max = GetVal(info1, trip, /* is_min */ false);
-  Value v2_min = GetVal(info2, trip, /* is_min */ true);
-  Value v2_max = GetVal(info2, trip, /* is_min */ false);
+  Value v1_min = GetVal(info1, trip, in_body, /* is_min */ true);
+  Value v1_max = GetVal(info1, trip, in_body, /* is_min */ false);
+  Value v2_min = GetVal(info2, trip, in_body, /* is_min */ true);
+  Value v2_max = GetVal(info2, trip, in_body, /* is_min */ false);
   if (v1_min.is_known && v1_min.a_constant == 0 && v1_min.b_constant >= 0) {
     // Positive range vs. positive or negative range.
     if (v2_min.is_known && v2_min.a_constant == 0 && v2_min.b_constant >= 0) {
@@ -228,11 +225,12 @@
 InductionVarRange::Value InductionVarRange::GetDiv(HInductionVarAnalysis::InductionInfo* info1,
                                                    HInductionVarAnalysis::InductionInfo* info2,
                                                    HInductionVarAnalysis::InductionInfo* trip,
+                                                   bool in_body,
                                                    bool is_min) {
-  Value v1_min = GetVal(info1, trip, /* is_min */ true);
-  Value v1_max = GetVal(info1, trip, /* is_min */ false);
-  Value v2_min = GetVal(info2, trip, /* is_min */ true);
-  Value v2_max = GetVal(info2, trip, /* is_min */ false);
+  Value v1_min = GetVal(info1, trip, in_body, /* is_min */ true);
+  Value v1_max = GetVal(info1, trip, in_body, /* is_min */ false);
+  Value v2_min = GetVal(info2, trip, in_body, /* is_min */ true);
+  Value v2_max = GetVal(info2, trip, in_body, /* is_min */ false);
   if (v1_min.is_known && v1_min.a_constant == 0 && v1_min.b_constant >= 0) {
     // Positive range vs. positive or negative range.
     if (v2_min.is_known && v2_min.a_constant == 0 && v2_min.b_constant >= 0) {
@@ -255,6 +253,16 @@
   return Value();
 }
 
+bool InductionVarRange::GetConstant(HInductionVarAnalysis::InductionInfo* info, int32_t* value) {
+  Value v_min = GetVal(info, nullptr, false, /* is_min */ true);
+  Value v_max = GetVal(info, nullptr, false, /* is_min */ false);
+  if (v_min.a_constant == 0 && v_max.a_constant == 0 && v_min.b_constant == v_max.b_constant) {
+    *value = v_min.b_constant;
+    return true;
+  }
+  return false;
+}
+
 InductionVarRange::Value InductionVarRange::AddValue(Value v1, Value v2) {
   if (v1.is_known && v2.is_known && IsSafeAdd(v1.b_constant, v2.b_constant)) {
     const int32_t b = v1.b_constant + v2.b_constant;
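
GetConstant above declares an expression constant exactly when its computed minimum and maximum collapse to the same pure constant (no symbolic part, equal constant part); this is what lets IsIntAndGet in the analysis resolve compound invariants rather than only literal fetches. A sketch of the min == max test on a toy value type (assumed shape mirroring InductionVarRange::Value):

    #include <cassert>

    // Toy analogue of InductionVarRange::Value: a_constant * instr + b_constant.
    struct Value {
      int a_constant;
      int b_constant;
    };

    bool GetConstantToy(Value v_min, Value v_max, int* out) {
      if (v_min.a_constant == 0 && v_max.a_constant == 0 &&
          v_min.b_constant == v_max.b_constant) {
        *out = v_min.b_constant;  // both ends pinned to the same constant
        return true;
      }
      return false;
    }

    int main() {
      int value = 0;
      assert(GetConstantToy({0, 12345}, {0, 12345}, &value) && value == 12345);
      assert(!GetConstantToy({0, 1}, {0, 2}, &value));  // a true range is not constant
      return 0;
    }
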
diff --git a/compiler/optimizing/induction_var_range.h b/compiler/optimizing/induction_var_range.h
index 8280c8b..dbdd2ee 100644
--- a/compiler/optimizing/induction_var_range.h
+++ b/compiler/optimizing/induction_var_range.h
@@ -73,24 +73,29 @@
   // Private helper methods.
   //
 
-  HInductionVarAnalysis::InductionInfo* GetTripCount(HLoopInformation* loop, HInstruction* context);
+  Value GetInduction(HInstruction* context, HInstruction* instruction, bool is_min);
 
   static Value GetFetch(HInstruction* instruction,
                         HInductionVarAnalysis::InductionInfo* trip,
+                        bool in_body,
                         bool is_min);
-
   static Value GetVal(HInductionVarAnalysis::InductionInfo* info,
                       HInductionVarAnalysis::InductionInfo* trip,
+                      bool in_body,
                       bool is_min);
   static Value GetMul(HInductionVarAnalysis::InductionInfo* info1,
                       HInductionVarAnalysis::InductionInfo* info2,
                       HInductionVarAnalysis::InductionInfo* trip,
+                      bool in_body,
                       bool is_min);
   static Value GetDiv(HInductionVarAnalysis::InductionInfo* info1,
                       HInductionVarAnalysis::InductionInfo* info2,
                       HInductionVarAnalysis::InductionInfo* trip,
+                      bool in_body,
                       bool is_min);
 
+  static bool GetConstant(HInductionVarAnalysis::InductionInfo* info, int32_t* value);
+
   static Value AddValue(Value v1, Value v2);
   static Value SubValue(Value v1, Value v2);
   static Value MulValue(Value v1, Value v2);
diff --git a/compiler/optimizing/induction_var_range_test.cc b/compiler/optimizing/induction_var_range_test.cc
index 5d9a075..4497a88 100644
--- a/compiler/optimizing/induction_var_range_test.cc
+++ b/compiler/optimizing/induction_var_range_test.cc
@@ -85,8 +85,7 @@
 
   /** Constructs a trip-count. */
   HInductionVarAnalysis::InductionInfo* CreateTripCount(int32_t tc) {
-    HInductionVarAnalysis::InductionInfo* trip = CreateConst(tc);
-    return CreateInvariant('@', trip, trip);
+    return iva_->CreateTripCount(HInductionVarAnalysis::kTripCountInLoop, CreateConst(tc));
   }
 
   /** Constructs a linear a * i + b induction. */
@@ -112,24 +111,28 @@
 
   Value GetMin(HInductionVarAnalysis::InductionInfo* info,
                HInductionVarAnalysis::InductionInfo* induc) {
-    return InductionVarRange::GetVal(info, induc, /* is_min */ true);
+    return InductionVarRange::GetVal(info, induc, /* in_body */ true, /* is_min */ true);
   }
 
   Value GetMax(HInductionVarAnalysis::InductionInfo* info,
                HInductionVarAnalysis::InductionInfo* induc) {
-    return InductionVarRange::GetVal(info, induc, /* is_min */ false);
+    return InductionVarRange::GetVal(info, induc, /* in_body */ true, /* is_min */ false);
   }
 
   Value GetMul(HInductionVarAnalysis::InductionInfo* info1,
                HInductionVarAnalysis::InductionInfo* info2,
                bool is_min) {
-    return InductionVarRange::GetMul(info1, info2, nullptr, is_min);
+    return InductionVarRange::GetMul(info1, info2, nullptr, /* in_body */ true, is_min);
   }
 
   Value GetDiv(HInductionVarAnalysis::InductionInfo* info1,
                HInductionVarAnalysis::InductionInfo* info2,
                bool is_min) {
-    return InductionVarRange::GetDiv(info1, info2, nullptr, is_min);
+    return InductionVarRange::GetDiv(info1, info2, nullptr, /* in_body */ true, is_min);
+  }
+
+  bool GetConstant(HInductionVarAnalysis::InductionInfo* info, int32_t* value) {
+    return InductionVarRange::GetConstant(info, value);
   }
 
   Value AddValue(Value v1, Value v2) { return InductionVarRange::AddValue(v1, v2); }
@@ -279,6 +282,13 @@
   ExpectEqual(Value(500), GetDiv(CreateRange(-1000, -40), CreateRange(-4, -2), false));
 }
 
+TEST_F(InductionVarRangeTest, GetConstant) {
+  int32_t value;
+  ASSERT_TRUE(GetConstant(CreateConst(12345), &value));
+  EXPECT_EQ(12345, value);
+  EXPECT_FALSE(GetConstant(CreateRange(1, 2), &value));
+}
+
 TEST_F(InductionVarRangeTest, AddValue) {
   ExpectEqual(Value(110), AddValue(Value(10), Value(100)));
   ExpectEqual(Value(-5), AddValue(Value(&x_, 1, -4), Value(&x_, -1, -1)));
diff --git a/compiler/optimizing/instruction_simplifier.cc b/compiler/optimizing/instruction_simplifier.cc
index 0ac26de..22bca2f 100644
--- a/compiler/optimizing/instruction_simplifier.cc
+++ b/compiler/optimizing/instruction_simplifier.cc
@@ -71,7 +71,8 @@
   void VisitXor(HXor* instruction) OVERRIDE;
   void VisitInstanceOf(HInstanceOf* instruction) OVERRIDE;
   void VisitFakeString(HFakeString* fake_string) OVERRIDE;
-  bool IsDominatedByInputNullCheck(HInstruction* instr);
+
+  bool CanEnsureNotNullAt(HInstruction* instr, HInstruction* at) const;
 
   OptimizingCompilerStats* stats_;
   bool simplification_occurred_ = false;
@@ -187,14 +188,18 @@
   }
 }
 
-bool InstructionSimplifierVisitor::IsDominatedByInputNullCheck(HInstruction* instr) {
-  HInstruction* input = instr->InputAt(0);
+bool InstructionSimplifierVisitor::CanEnsureNotNullAt(HInstruction* input, HInstruction* at) const {
+  if (!input->CanBeNull()) {
+    return true;
+  }
+
   for (HUseIterator<HInstruction*> it(input->GetUses()); !it.Done(); it.Advance()) {
     HInstruction* use = it.Current()->GetUser();
-    if (use->IsNullCheck() && use->StrictlyDominates(instr)) {
+    if (use->IsNullCheck() && use->StrictlyDominates(at)) {
       return true;
     }
   }
+
   return false;
 }
 
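Note: the renamed helper folds the old !CanBeNull() fast path and the dominating-null-check scan into a single query: is `input` provably non-null at the position of `at`? An ART-independent sketch of the idea, using hypothetical Node/StrictlyDominates stand-ins for the real IR types:

    #include <vector>

    struct Node {
      bool can_be_null = true;
      bool is_null_check = false;
      std::vector<const Node*> users;
    };

    bool StrictlyDominates(const Node* a, const Node* b);  // assumed given

    // A value is known non-null at `at` either because its type says it can
    // never be null, or because an earlier null check on it dominates `at`.
    bool CanEnsureNotNullAt(const Node* input, const Node* at) {
      if (!input->can_be_null) {
        return true;
      }
      for (const Node* use : input->users) {
        if (use->is_null_check && StrictlyDominates(use, at)) {
          return true;
        }
      }
      return false;
    }

This is what lets the field and array stores below call ClearValueCanBeNull() even for a nullable value, as long as a NullCheck on that value dominates the store.
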
@@ -231,7 +236,7 @@
 
 void InstructionSimplifierVisitor::VisitCheckCast(HCheckCast* check_cast) {
   HInstruction* object = check_cast->InputAt(0);
-  if (!object->CanBeNull() || IsDominatedByInputNullCheck(check_cast)) {
+  if (CanEnsureNotNullAt(object, check_cast)) {
     check_cast->ClearMustDoNullCheck();
   }
 
@@ -267,7 +272,7 @@
 void InstructionSimplifierVisitor::VisitInstanceOf(HInstanceOf* instruction) {
   HInstruction* object = instruction->InputAt(0);
   bool can_be_null = true;
-  if (!object->CanBeNull() || IsDominatedByInputNullCheck(instruction)) {
+  if (CanEnsureNotNullAt(object, instruction)) {
     can_be_null = false;
     instruction->ClearMustDoNullCheck();
   }
@@ -305,14 +310,14 @@
 
 void InstructionSimplifierVisitor::VisitInstanceFieldSet(HInstanceFieldSet* instruction) {
   if ((instruction->GetValue()->GetType() == Primitive::kPrimNot)
-      && !instruction->GetValue()->CanBeNull()) {
+      && CanEnsureNotNullAt(instruction->GetValue(), instruction)) {
     instruction->ClearValueCanBeNull();
   }
 }
 
 void InstructionSimplifierVisitor::VisitStaticFieldSet(HStaticFieldSet* instruction) {
   if ((instruction->GetValue()->GetType() == Primitive::kPrimNot)
-      && !instruction->GetValue()->CanBeNull()) {
+      && CanEnsureNotNullAt(instruction->GetValue(), instruction)) {
     instruction->ClearValueCanBeNull();
   }
 }
@@ -437,7 +442,7 @@
     instruction->ClearNeedsTypeCheck();
   }
 
-  if (!value->CanBeNull()) {
+  if (CanEnsureNotNullAt(value, instruction)) {
     instruction->ClearValueCanBeNull();
   }
 }
@@ -502,14 +507,45 @@
   HConstant* input_cst = instruction->GetConstantRight();
   HInstruction* input_other = instruction->GetLeastConstantLeft();
 
-  if ((input_cst != nullptr) && AreAllBitsSet(input_cst)) {
-    // Replace code looking like
-    //    AND dst, src, 0xFFF...FF
-    // with
-    //    src
-    instruction->ReplaceWith(input_other);
-    instruction->GetBlock()->RemoveInstruction(instruction);
-    return;
+  if (input_cst != nullptr) {
+    int64_t value = Int64FromConstant(input_cst);
+    if (value == -1) {
+      // Replace code looking like
+      //    AND dst, src, 0xFFF...FF
+      // with
+      //    src
+      instruction->ReplaceWith(input_other);
+      instruction->GetBlock()->RemoveInstruction(instruction);
+      RecordSimplification();
+      return;
+    }
+    // Eliminate And from UShr+And if the And-mask contains all the bits that
+    // can be non-zero after UShr. Transform Shr+And to UShr if the And-mask
+    // precisely clears the shifted-in sign bits.
+    if ((input_other->IsUShr() || input_other->IsShr()) && input_other->InputAt(1)->IsConstant()) {
+      size_t reg_bits = (instruction->GetResultType() == Primitive::kPrimLong) ? 64 : 32;
+      size_t shift = Int64FromConstant(input_other->InputAt(1)->AsConstant()) & (reg_bits - 1);
+      size_t num_tail_bits_set = CTZ(value + 1);
+      if ((num_tail_bits_set >= reg_bits - shift) && input_other->IsUShr()) {
+        // This AND clears only bits known to be clear, for example "(x >>> 24) & 0xff".
+        instruction->ReplaceWith(input_other);
+        instruction->GetBlock()->RemoveInstruction(instruction);
+        RecordSimplification();
+        return;
+      } else if ((num_tail_bits_set == reg_bits - shift) && IsPowerOfTwo(value + 1) &&
+          input_other->HasOnlyOneNonEnvironmentUse()) {
+        DCHECK(input_other->IsShr());  // For UShr, we would have taken the branch above.
+        // Replace SHR+AND with USHR, for example "(x >> 24) & 0xff" -> "x >>> 24".
+        HUShr* ushr = new (GetGraph()->GetArena()) HUShr(instruction->GetType(),
+                                                         input_other->InputAt(0),
+                                                         input_other->InputAt(1),
+                                                         input_other->GetDexPc());
+        instruction->GetBlock()->ReplaceAndRemoveInstructionWith(instruction, ushr);
+        input_other->GetBlock()->RemoveInstruction(input_other);
+        RecordSimplification();
+        return;
+      }
+    }
   }
 
   // We assume that GVN has run before, so we only perform a pointer comparison.
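
Note: the two new arms rest on a small arithmetic fact: for a mask of the form 0...011...1, CTZ(mask + 1) is the number of trailing one bits, and after shifting right by `shift` only the low `reg_bits - shift` bits can differ from the sign fill. A compilable sketch checking both shapes on concrete values (not compiler code; right-shifting a negative int is arithmetic on all targets ART supports):

    #include <cassert>
    #include <cstdint>

    int main() {
      // "(x >>> 24) & 0xff": CTZ(0xff + 1) == 8 trailing ones, and
      // reg_bits - shift == 32 - 24 == 8, so the mask covers every bit the
      // shift can leave set and the And is a no-op.
      uint32_t x = 0xdeadbeefu;
      assert(((x >> 24) & 0xffu) == (x >> 24));

      // "(x >> 24) & 0xff" with x negative: 0xff + 1 is a power of two, so
      // the mask clears exactly the 24 shifted-in sign bits, and Shr+And
      // computes the same value as UShr.
      int32_t y = static_cast<int32_t>(0xdeadbeef);
      assert(((y >> 24) & 0xff) ==
             static_cast<int32_t>(static_cast<uint32_t>(y) >> 24));
      return 0;
    }
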
diff --git a/compiler/optimizing/locations.h b/compiler/optimizing/locations.h
index 2eeba18..de4fb7e 100644
--- a/compiler/optimizing/locations.h
+++ b/compiler/optimizing/locations.h
@@ -22,7 +22,6 @@
 #include "base/bit_field.h"
 #include "base/bit_vector.h"
 #include "base/value_object.h"
-#include "utils/growable_array.h"
 
 namespace art {
 
@@ -469,7 +468,7 @@
  * The intent is to have the code for generating the instruction independent of
  * register allocation. A register allocator just has to provide a LocationSummary.
  */
-class LocationSummary : public ArenaObject<kArenaAllocMisc> {
+class LocationSummary : public ArenaObject<kArenaAllocLocationSummary> {
  public:
   enum CallKind {
     kNoCall,
diff --git a/compiler/optimizing/nodes.cc b/compiler/optimizing/nodes.cc
index ef89932..989970f 100644
--- a/compiler/optimizing/nodes.cc
+++ b/compiler/optimizing/nodes.cc
@@ -22,7 +22,6 @@
 #include "base/bit_utils.h"
 #include "base/stl_util.h"
 #include "mirror/class-inl.h"
-#include "utils/growable_array.h"
 #include "scoped_thread_state_change.h"
 
 namespace art {
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 26df241..486968c 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -35,7 +35,6 @@
 #include "offsets.h"
 #include "primitive.h"
 #include "utils/arena_bit_vector.h"
-#include "utils/growable_array.h"
 
 namespace art {
 
@@ -5056,7 +5055,10 @@
 class HParallelMove : public HTemplateInstruction<0> {
  public:
   explicit HParallelMove(ArenaAllocator* arena, uint32_t dex_pc = kNoDexPc)
-      : HTemplateInstruction(SideEffects::None(), dex_pc), moves_(arena, kDefaultNumberOfMoves) {}
+      : HTemplateInstruction(SideEffects::None(), dex_pc),
+        moves_(arena->Adapter(kArenaAllocMoveOperands)) {
+    moves_.reserve(kDefaultNumberOfMoves);
+  }
 
   void AddMove(Location source,
                Location destination,
@@ -5066,15 +5068,15 @@
     DCHECK(destination.IsValid());
     if (kIsDebugBuild) {
       if (instruction != nullptr) {
-        for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
-          if (moves_.Get(i).GetInstruction() == instruction) {
+        for (const MoveOperands& move : moves_) {
+          if (move.GetInstruction() == instruction) {
             // Special case the situation where the move is for the spill slot
             // of the instruction.
             if ((GetPrevious() == instruction)
                 || ((GetPrevious() == nullptr)
                     && instruction->IsPhi()
                     && instruction->GetBlock() == GetBlock())) {
-              DCHECK_NE(destination.GetKind(), moves_.Get(i).GetDestination().GetKind())
+              DCHECK_NE(destination.GetKind(), move.GetDestination().GetKind())
                   << "Doing parallel moves for the same instruction.";
             } else {
               DCHECK(false) << "Doing parallel moves for the same instruction.";
@@ -5082,26 +5084,27 @@
           }
         }
       }
-      for (size_t i = 0, e = moves_.Size(); i < e; ++i) {
-        DCHECK(!destination.OverlapsWith(moves_.Get(i).GetDestination()))
+      for (const MoveOperands& move : moves_) {
+        DCHECK(!destination.OverlapsWith(move.GetDestination()))
             << "Overlapped destination for two moves in a parallel move: "
-            << moves_.Get(i).GetSource() << " ==> " << moves_.Get(i).GetDestination() << " and "
+            << move.GetSource() << " ==> " << move.GetDestination() << " and "
             << source << " ==> " << destination;
       }
     }
-    moves_.Add(MoveOperands(source, destination, type, instruction));
+    moves_.emplace_back(source, destination, type, instruction);
   }
 
-  MoveOperands* MoveOperandsAt(size_t index) const {
-    return moves_.GetRawStorage() + index;
+  MoveOperands* MoveOperandsAt(size_t index) {
+    DCHECK_LT(index, moves_.size());
+    return &moves_[index];
   }
 
-  size_t NumMoves() const { return moves_.Size(); }
+  size_t NumMoves() const { return moves_.size(); }
 
   DECLARE_INSTRUCTION(ParallelMove);
 
  private:
-  GrowableArray<MoveOperands> moves_;
+  ArenaVector<MoveOperands> moves_;
 
   DISALLOW_COPY_AND_ASSIGN(HParallelMove);
 };
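
Note: the HParallelMove change is an instance of the migration pattern used throughout this CL: the GrowableArray(arena, capacity) constructor becomes an ArenaVector built from a tagged adapter plus an explicit reserve(), so each container's memory is attributed to its own arena allocation counter. A sketch of the mapping (assuming the ART arena headers; the function is illustrative only):

    #include "base/arena_containers.h"

    void Example(art::ArenaAllocator* arena) {
      art::ArenaVector<int> values(arena->Adapter(art::kArenaAllocMoveOperands));
      values.reserve(32);          // was: GrowableArray<int> values(arena, 32)
      values.push_back(1);         // was: values.Add(1)
      size_t n = values.size();    // was: values.Size()
      int first = values[0];       // was: values.Get(0)
      values.clear();              // was: values.Reset()
      (void)n; (void)first;
    }
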
diff --git a/compiler/optimizing/optimizing_compiler.cc b/compiler/optimizing/optimizing_compiler.cc
index a2b6131..dbfbd96 100644
--- a/compiler/optimizing/optimizing_compiler.cc
+++ b/compiler/optimizing/optimizing_compiler.cc
@@ -321,8 +321,7 @@
 OptimizingCompiler::OptimizingCompiler(CompilerDriver* driver)
     : Compiler(driver, kMaximumCompilationTimeBeforeWarning),
       run_optimizations_(
-          (driver->GetCompilerOptions().GetCompilerFilter() != CompilerOptions::kTime)
-          && !driver->GetCompilerOptions().GetDebuggable()),
+          driver->GetCompilerOptions().GetCompilerFilter() != CompilerOptions::kTime),
       delegate_(Create(driver, Compiler::Kind::kQuick)) {}
 
 void OptimizingCompiler::Init() {
@@ -575,12 +574,6 @@
                                                      CompilerDriver* compiler_driver,
                                                      const DexCompilationUnit& dex_compilation_unit,
                                                      PassObserver* pass_observer) const {
-  if (graph->HasTryCatch() && graph->IsDebuggable()) {
-    // TODO: b/24054676, stop creating catch phis eagerly to avoid special cases like phis without
-    // inputs.
-    return nullptr;
-  }
-
   ScopedObjectAccess soa(Thread::Current());
   StackHandleScopeCollection handles(soa.Self());
   soa.Self()->TransitionFromRunnableToSuspended(kNative);
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index f9d812f..fce7769 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -16,6 +16,8 @@
 #include <iostream>
 
 #include "parallel_move_resolver.h"
+
+#include "base/stl_util.h"
 #include "nodes.h"
 
 namespace art {
@@ -28,19 +30,19 @@
   for (size_t i = 0; i < parallel_move->NumMoves(); ++i) {
     MoveOperands* move = parallel_move->MoveOperandsAt(i);
     if (!move->IsRedundant()) {
-      moves_.Add(move);
+      moves_.push_back(move);
     }
   }
 }
 
 void ParallelMoveResolverWithSwap::EmitNativeCode(HParallelMove* parallel_move) {
-  DCHECK(moves_.IsEmpty());
+  DCHECK(moves_.empty());
   // Build up a worklist of moves.
   BuildInitialMoveList(parallel_move);
 
   // Move stack/stack slot to take advantage of a free register on constrained machines.
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    const MoveOperands& move = *moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    const MoveOperands& move = *moves_[i];
     // Ignore constants and moves already eliminated.
     if (move.IsEliminated() || move.GetSource().IsConstant()) {
       continue;
@@ -52,8 +54,8 @@
     }
   }
 
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    const MoveOperands& move = *moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    const MoveOperands& move = *moves_[i];
     // Skip constants to perform them last.  They don't block other moves
     // and skipping such moves with register destinations keeps those
     // registers free for the whole algorithm.
@@ -63,8 +65,8 @@
   }
 
   // Perform the moves with constant sources.
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    MoveOperands* move = moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    MoveOperands* move = moves_[i];
     if (!move->IsEliminated()) {
       DCHECK(move->GetSource().IsConstant());
       EmitMove(i);
@@ -73,7 +75,7 @@
     }
   }
 
-  moves_.Reset();
+  moves_.clear();
 }
 
 Location LowOf(Location location) {
@@ -123,7 +125,8 @@
   // which means that a call to PerformMove could change any source operand
   // in the move graph.
 
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   DCHECK(!move->IsPending());
   if (move->IsRedundant()) {
     // Because we swap register pairs first, following, un-pending
@@ -143,8 +146,8 @@
   // as this one's destination blocks this one so recursively perform all
   // such moves.
   MoveOperands* required_swap = nullptr;
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    const MoveOperands& other_move = *moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    const MoveOperands& other_move = *moves_[i];
     if (other_move.Blocks(destination) && !other_move.IsPending()) {
       // Though PerformMove can change any source operand in the move graph,
       // calling `PerformMove` cannot create a blocking move via a swap
@@ -163,7 +166,7 @@
         // at the next moves. Swapping is not blocked by anything, it just
        // updates other moves' sources.
         break;
-      } else if (required_swap == moves_.Get(i)) {
+      } else if (required_swap == moves_[i]) {
         // If `other_move` was swapped, we iterate again to find a new
         // potential cycle.
         required_swap = nullptr;
@@ -171,7 +174,7 @@
       } else if (required_swap != nullptr) {
         // A move is required to swap. We walk back the cycle to find the
        // move by just returning from this `PerformMove`.
-        moves_.Get(index)->ClearPending(destination);
+        moves_[index]->ClearPending(destination);
         return required_swap;
       }
     }
@@ -197,14 +200,13 @@
     DCHECK_EQ(required_swap, move);
     do_swap = true;
   } else {
-    for (size_t i = 0; i < moves_.Size(); ++i) {
-      const MoveOperands& other_move = *moves_.Get(i);
-      if (other_move.Blocks(destination)) {
-        DCHECK(other_move.IsPending());
-        if (!move->Is64BitMove() && other_move.Is64BitMove()) {
+    for (MoveOperands* other_move : moves_) {
+      if (other_move->Blocks(destination)) {
+        DCHECK(other_move->IsPending());
+        if (!move->Is64BitMove() && other_move->Is64BitMove()) {
          // We swap 64-bit moves before swapping 32-bit moves. Go back from the
           // cycle by returning the move that must be swapped.
-          return moves_.Get(i);
+          return other_move;
         }
         do_swap = true;
         break;
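
Note, for context on the cycle handling above: a parallel move such as { r0 -> r1, r1 -> r0 } blocks itself, and the resolver breaks the cycle with one swap rather than a temporary; returning the 64-bit member of a mixed-width cycle first ensures a narrow swap never splits a register pair. A plain-C++ illustration of a swap resolving a two-move cycle (a sketch, not resolver code):

    #include <cassert>
    #include <utility>

    int main() {
      // { r0 -> r1, r1 -> r0 }: each destination is blocked by the other's
      // source, so neither move can be emitted alone; one swap performs both.
      int r0 = 1, r1 = 2;
      std::swap(r0, r1);  // EmitSwap equivalent; both moves are then eliminated
      assert(r0 == 2 && r1 == 1);
      return 0;
    }
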
@@ -220,12 +222,11 @@
     Location source = move->GetSource();
     Location swap_destination = move->GetDestination();
     move->Eliminate();
-    for (size_t i = 0; i < moves_.Size(); ++i) {
-      const MoveOperands& other_move = *moves_.Get(i);
-      if (other_move.Blocks(source)) {
-        UpdateSourceOf(moves_.Get(i), source, swap_destination);
-      } else if (other_move.Blocks(swap_destination)) {
-        UpdateSourceOf(moves_.Get(i), swap_destination, source);
+    for (MoveOperands* other_move : moves_) {
+      if (other_move->Blocks(source)) {
+        UpdateSourceOf(other_move, source, swap_destination);
+      } else if (other_move->Blocks(swap_destination)) {
+        UpdateSourceOf(other_move, swap_destination, source);
       }
     }
     // If the swap was required because of a 64-bit move in the middle of a cycle,
@@ -242,14 +243,14 @@
 }
 
 bool ParallelMoveResolverWithSwap::IsScratchLocation(Location loc) {
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    if (moves_.Get(i)->Blocks(loc)) {
+  for (MoveOperands* move : moves_) {
+    if (move->Blocks(loc)) {
       return false;
     }
   }
 
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    if (moves_.Get(i)->GetDestination().Equals(loc)) {
+  for (MoveOperands* move : moves_) {
+    if (move->GetDestination().Equals(loc)) {
       return true;
     }
   }
@@ -302,8 +303,8 @@
 
 void ParallelMoveResolverNoSwap::EmitNativeCode(HParallelMove* parallel_move) {
   DCHECK_EQ(GetNumberOfPendingMoves(), 0u);
-  DCHECK(moves_.IsEmpty());
-  DCHECK(scratches_.IsEmpty());
+  DCHECK(moves_.empty());
+  DCHECK(scratches_.empty());
 
   // Backend dependent initialization.
   PrepareForEmitNativeCode();
@@ -311,8 +312,8 @@
   // Build up a worklist of moves.
   BuildInitialMoveList(parallel_move);
 
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    const MoveOperands& move = *moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    const MoveOperands& move = *moves_[i];
     // Skip constants to perform them last. They don't block other moves and
     // skipping such moves with register destinations keeps those registers
     // free for the whole algorithm.
@@ -324,8 +325,8 @@
   // Perform the moves with constant sources and register destinations with UpdateMoveSource()
   // to reduce the number of literal loads. Stack destinations are skipped since we won't benefit
   // from changing the constant sources to stack locations.
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    MoveOperands* move = moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    MoveOperands* move = moves_[i];
     Location destination = move->GetDestination();
     if (!move->IsEliminated() && !destination.IsStackSlot() && !destination.IsDoubleStackSlot()) {
       Location source = move->GetSource();
@@ -344,8 +345,8 @@
   }
 
   // Perform the rest of the moves.
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    MoveOperands* move = moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    MoveOperands* move = moves_[i];
     if (!move->IsEliminated()) {
       EmitMove(i);
       move->Eliminate();
@@ -358,19 +359,18 @@
   // Backend dependent cleanup.
   FinishEmitNativeCode();
 
-  moves_.Reset();
-  scratches_.Reset();
+  moves_.clear();
+  scratches_.clear();
 }
 
 Location ParallelMoveResolverNoSwap::GetScratchLocation(Location::Kind kind) {
-  for (size_t i = 0; i < scratches_.Size(); ++i) {
-    Location loc = scratches_.Get(i);
+  for (Location loc : scratches_) {
     if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
       return loc;
     }
   }
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    Location loc = moves_.Get(i)->GetDestination();
+  for (MoveOperands* move : moves_) {
+    Location loc = move->GetDestination();
     if (loc.GetKind() == kind && !IsBlockedByMoves(loc)) {
       return loc;
     }
@@ -380,18 +380,18 @@
 
 void ParallelMoveResolverNoSwap::AddScratchLocation(Location loc) {
   if (kIsDebugBuild) {
-    for (size_t i = 0; i < scratches_.Size(); ++i) {
-      DCHECK(!loc.Equals(scratches_.Get(i)));
+    for (Location scratch : scratches_) {
+      CHECK(!loc.Equals(scratch));
     }
   }
-  scratches_.Add(loc);
+  scratches_.push_back(loc);
 }
 
 void ParallelMoveResolverNoSwap::RemoveScratchLocation(Location loc) {
   DCHECK(!IsBlockedByMoves(loc));
-  for (size_t i = 0; i < scratches_.Size(); ++i) {
-    if (loc.Equals(scratches_.Get(i))) {
-      scratches_.DeleteAt(i);
+  for (auto it = scratches_.begin(), end = scratches_.end(); it != end; ++it) {
+    if (loc.Equals(*it)) {
+      scratches_.erase(it);
       break;
     }
   }
@@ -406,7 +406,8 @@
   // we will update source operand in the move graph to reduce dependencies in
   // the graph.
 
-  MoveOperands* move = moves_.Get(index);
+  DCHECK_LT(index, moves_.size());
+  MoveOperands* move = moves_[index];
   DCHECK(!move->IsPending());
   DCHECK(!move->IsEliminated());
   if (move->IsRedundant()) {
@@ -433,8 +434,8 @@
   // dependencies. Any unperformed, unpending move with a source the same
   // as this one's destination blocks this one so recursively perform all
   // such moves.
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    const MoveOperands& other_move = *moves_.Get(i);
+  for (size_t i = 0; i < moves_.size(); ++i) {
+    const MoveOperands& other_move = *moves_[i];
     if (other_move.Blocks(destination) && !other_move.IsPending()) {
       PerformMove(i);
     }
@@ -490,8 +491,11 @@
     move->Eliminate();
     UpdateMoveSource(pending_source, pending_destination);
     // Free any unblocked locations in the scratch location list.
-    for (size_t i = 0; i < scratches_.Size(); ++i) {
-      Location scratch = scratches_.Get(i);
+    // Note: Fetch size() on each iteration because scratches_ can be modified inside the loop.
+    // FIXME: If FreeScratchLocation() removes the location from scratches_,
+    // we skip the next location. This happens for arm64.
+    for (size_t i = 0; i < scratches_.size(); ++i) {
+      Location scratch = scratches_[i];
       // Only a scratch location overlapping with the performed move's source can be unblocked.
       if (scratch.OverlapsWith(pending_source) && !IsBlockedByMoves(scratch)) {
         FreeScratchLocation(pending_source);
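
Note: the FIXME above is the usual erase-during-indexed-iteration skip: when FreeScratchLocation() removes scratches_[i], the element that slides into slot i is never examined because the loop increments i immediately afterwards. A minimal standalone reproduction:

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> v = {1, 1, 3};
      for (size_t i = 0; i < v.size(); ++i) {
        if (v[i] == 1) {
          v.erase(v.begin() + i);  // the second 1 slides into slot i ...
        }
      }
      // ... and ++i jumps over it, so one 1 survives the "remove all 1s" loop.
      assert(v.size() == 2 && v[0] == 1 && v[1] == 3);
      return 0;
    }
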
@@ -512,8 +516,7 @@
   // This is not something we must do, but we can use fewer scratch locations with
   // this trick. For example, we can avoid using additional scratch locations for
   // moves (0 -> 1), (1 -> 2), (1 -> 0).
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    MoveOperands* move = moves_.Get(i);
+  for (MoveOperands* move : moves_) {
     if (move->GetSource().Equals(from)) {
       move->SetSource(to);
     }
@@ -522,16 +525,15 @@
 
 void ParallelMoveResolverNoSwap::AddPendingMove(Location source,
     Location destination, Primitive::Type type) {
-  pending_moves_.Add(new (allocator_) MoveOperands(source, destination, type, nullptr));
+  pending_moves_.push_back(new (allocator_) MoveOperands(source, destination, type, nullptr));
 }
 
 void ParallelMoveResolverNoSwap::DeletePendingMove(MoveOperands* move) {
-  pending_moves_.Delete(move);
+  RemoveElement(pending_moves_, move);
 }
 
 MoveOperands* ParallelMoveResolverNoSwap::GetUnblockedPendingMove(Location loc) {
-  for (size_t i = 0; i < pending_moves_.Size(); ++i) {
-    MoveOperands* move = pending_moves_.Get(i);
+  for (MoveOperands* move : pending_moves_) {
     Location destination = move->GetDestination();
     // Only moves with destination overlapping with input loc can be unblocked.
     if (destination.OverlapsWith(loc) && !IsBlockedByMoves(destination)) {
@@ -542,13 +544,13 @@
 }
 
 bool ParallelMoveResolverNoSwap::IsBlockedByMoves(Location loc) {
-  for (size_t i = 0; i < pending_moves_.Size(); ++i) {
-    if (pending_moves_.Get(i)->Blocks(loc)) {
+  for (MoveOperands* move : pending_moves_) {
+    if (move->Blocks(loc)) {
       return true;
     }
   }
-  for (size_t i = 0; i < moves_.Size(); ++i) {
-    if (moves_.Get(i)->Blocks(loc)) {
+  for (MoveOperands* move : moves_) {
+    if (move->Blocks(loc)) {
       return true;
     }
   }
@@ -558,7 +560,7 @@
 // So far it is only used for debugging purposes to make sure all pending moves
 // have been performed.
 size_t ParallelMoveResolverNoSwap::GetNumberOfPendingMoves() {
-  return pending_moves_.Size();
+  return pending_moves_.size();
 }
 
 }  // namespace art
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 9ede910..4278861 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -17,8 +17,8 @@
 #ifndef ART_COMPILER_OPTIMIZING_PARALLEL_MOVE_RESOLVER_H_
 #define ART_COMPILER_OPTIMIZING_PARALLEL_MOVE_RESOLVER_H_
 
+#include "base/arena_containers.h"
 #include "base/value_object.h"
-#include "utils/growable_array.h"
 #include "locations.h"
 #include "primitive.h"
 
@@ -31,7 +31,10 @@
 // have their own subclass that implements corresponding virtual functions.
 class ParallelMoveResolver : public ValueObject {
  public:
-  explicit ParallelMoveResolver(ArenaAllocator* allocator) : moves_(allocator, 32) {}
+  explicit ParallelMoveResolver(ArenaAllocator* allocator)
+      : moves_(allocator->Adapter(kArenaAllocParallelMoveResolver)) {
+    moves_.reserve(32);
+  }
   virtual ~ParallelMoveResolver() {}
 
   // Resolve a set of parallel moves, emitting assembler instructions.
@@ -41,7 +44,7 @@
   // Build the initial list of moves.
   void BuildInitialMoveList(HParallelMove* parallel_move);
 
-  GrowableArray<MoveOperands*> moves_;
+  ArenaVector<MoveOperands*> moves_;
 
  private:
   DISALLOW_COPY_AND_ASSIGN(ParallelMoveResolver);
@@ -120,8 +123,13 @@
 class ParallelMoveResolverNoSwap : public ParallelMoveResolver {
  public:
   explicit ParallelMoveResolverNoSwap(ArenaAllocator* allocator)
-      : ParallelMoveResolver(allocator), scratches_(allocator, 32),
-        pending_moves_(allocator, 8), allocator_(allocator) {}
+      : ParallelMoveResolver(allocator),
+        scratches_(allocator->Adapter(kArenaAllocParallelMoveResolver)),
+        pending_moves_(allocator->Adapter(kArenaAllocParallelMoveResolver)),
+        allocator_(allocator) {
+    scratches_.reserve(32);
+    pending_moves_.reserve(8);
+  }
   virtual ~ParallelMoveResolverNoSwap() {}
 
   // Resolve a set of parallel moves, emitting assembler instructions.
@@ -160,7 +168,7 @@
   void RemoveScratchLocation(Location loc);
 
   // List of scratch locations.
-  GrowableArray<Location> scratches_;
+  ArenaVector<Location> scratches_;
 
  private:
   // Perform the move at the given index in `moves_` (possibly requiring other moves to satisfy
@@ -183,7 +191,7 @@
   size_t GetNumberOfPendingMoves();
 
   // Additional pending moves which might be added to resolve dependency cycle.
-  GrowableArray<MoveOperands*> pending_moves_;
+  ArenaVector<MoveOperands*> pending_moves_;
 
   // Used to allocate pending MoveOperands.
   ArenaAllocator* const allocator_;
diff --git a/compiler/optimizing/parallel_move_test.cc b/compiler/optimizing/parallel_move_test.cc
index f8f7010..da91cb8 100644
--- a/compiler/optimizing/parallel_move_test.cc
+++ b/compiler/optimizing/parallel_move_test.cc
@@ -56,7 +56,8 @@
       : ParallelMoveResolverWithSwap(allocator) {}
 
   void EmitMove(size_t index) OVERRIDE {
-    MoveOperands* move = moves_.Get(index);
+    DCHECK_LT(index, moves_.size());
+    MoveOperands* move = moves_[index];
     if (!message_.str().empty()) {
       message_ << " ";
     }
@@ -68,7 +69,8 @@
   }
 
   void EmitSwap(size_t index) OVERRIDE {
-    MoveOperands* move = moves_.Get(index);
+    DCHECK_LT(index, moves_.size());
+    MoveOperands* move = moves_[index];
     if (!message_.str().empty()) {
       message_ << " ";
     }
@@ -127,7 +129,8 @@
   void FreeScratchLocation(Location loc ATTRIBUTE_UNUSED) OVERRIDE {}
 
   void EmitMove(size_t index) OVERRIDE {
-    MoveOperands* move = moves_.Get(index);
+    DCHECK_LT(index, moves_.size());
+    MoveOperands* move = moves_[index];
     if (!message_.str().empty()) {
       message_ << " ";
     }
diff --git a/compiler/optimizing/register_allocator.cc b/compiler/optimizing/register_allocator.cc
index c43e58f..9cdb89b 100644
--- a/compiler/optimizing/register_allocator.cc
+++ b/compiler/optimizing/register_allocator.cc
@@ -187,7 +187,8 @@
   }
 
   number_of_registers_ = codegen_->GetNumberOfCoreRegisters();
-  registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_);
+  registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_,
+                                                    kArenaAllocRegisterAllocator);
   processing_core_registers_ = true;
   unhandled_ = &unhandled_core_intervals_;
   for (LiveInterval* fixed : physical_core_register_intervals_) {
@@ -206,7 +207,8 @@
   handled_.clear();
 
   number_of_registers_ = codegen_->GetNumberOfFloatingPointRegisters();
-  registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_);
+  registers_array_ = allocator_->AllocArray<size_t>(number_of_registers_,
+                                                    kArenaAllocRegisterAllocator);
   processing_core_registers_ = false;
   unhandled_ = &unhandled_fp_intervals_;
   for (LiveInterval* fixed : physical_fp_register_intervals_) {
diff --git a/compiler/optimizing/ssa_builder.cc b/compiler/optimizing/ssa_builder.cc
index fb11d76..40c75af 100644
--- a/compiler/optimizing/ssa_builder.cc
+++ b/compiler/optimizing/ssa_builder.cc
@@ -56,6 +56,24 @@
   DISALLOW_COPY_AND_ASSIGN(DeadPhiHandling);
 };
 
+static bool HasConflictingEquivalent(HPhi* phi) {
+  if (phi->GetNext() == nullptr) {
+    return false;
+  }
+  HPhi* next = phi->GetNext()->AsPhi();
+  if (next->GetRegNumber() == phi->GetRegNumber()) {
+    if (next->GetType() == Primitive::kPrimVoid) {
+      // We only get a void type for an equivalent phi we processed and found out
+      // it was conflicting.
+      return true;
+    } else {
+      // Go to the next phi, in case it is also an equivalent.
+      return HasConflictingEquivalent(next);
+    }
+  }
+  return false;
+}
+
 bool DeadPhiHandling::UpdateType(HPhi* phi) {
   if (phi->IsDead()) {
     // Phi was rendered dead while waiting in the worklist because it was replaced
@@ -87,21 +105,26 @@
     if (new_type == Primitive::kPrimVoid) {
       new_type = input_type;
     } else if (new_type == Primitive::kPrimNot && input_type == Primitive::kPrimInt) {
+      if (input->IsPhi() && HasConflictingEquivalent(input->AsPhi())) {
+        // If we already asked for an equivalent of the input phi, but that equivalent
+        // ended up conflicting, make this phi conflicting too.
+        conflict = true;
+        break;
+      }
       HInstruction* equivalent = SsaBuilder::GetReferenceTypeEquivalent(input);
       if (equivalent == nullptr) {
         conflict = true;
         break;
-      } else {
-        phi->ReplaceInput(equivalent, i);
-        if (equivalent->IsPhi()) {
-          DCHECK_EQ(equivalent->GetType(), Primitive::kPrimNot);
-          // We created a new phi, but that phi has the same inputs as the old phi. We
-          // add it to the worklist to ensure its inputs can also be converted to reference.
-          // If not, it will remain dead, and the algorithm will make the current phi dead
-          // as well.
-          equivalent->AsPhi()->SetLive();
-          AddToWorklist(equivalent->AsPhi());
-        }
+      }
+      phi->ReplaceInput(equivalent, i);
+      if (equivalent->IsPhi()) {
+        DCHECK_EQ(equivalent->GetType(), Primitive::kPrimNot);
+        // We created a new phi, but that phi has the same inputs as the old phi. We
+        // add it to the worklist to ensure its inputs can also be converted to reference.
+        // If not, it will remain dead, and the algorithm will make the current phi dead
+        // as well.
+        equivalent->AsPhi()->SetLive();
+        AddToWorklist(equivalent->AsPhi());
       }
     } else if (new_type == Primitive::kPrimInt && input_type == Primitive::kPrimNot) {
       new_type = Primitive::kPrimNot;
diff --git a/compiler/optimizing/ssa_liveness_analysis.h b/compiler/optimizing/ssa_liveness_analysis.h
index 414cc7d..e4b0999 100644
--- a/compiler/optimizing/ssa_liveness_analysis.h
+++ b/compiler/optimizing/ssa_liveness_analysis.h
@@ -27,7 +27,7 @@
 
 static constexpr int kNoRegister = -1;
 
-class BlockInfo : public ArenaObject<kArenaAllocMisc> {
+class BlockInfo : public ArenaObject<kArenaAllocSsaLiveness> {
  public:
   BlockInfo(ArenaAllocator* allocator, const HBasicBlock& block, size_t number_of_ssa_values)
       : block_(block),
@@ -55,7 +55,7 @@
  * A live range contains the start and end of a range where an instruction or a temporary
  * is live.
  */
-class LiveRange FINAL : public ArenaObject<kArenaAllocMisc> {
+class LiveRange FINAL : public ArenaObject<kArenaAllocSsaLiveness> {
  public:
   LiveRange(size_t start, size_t end, LiveRange* next) : start_(start), end_(end), next_(next) {
     DCHECK_LT(start, end);
@@ -101,7 +101,7 @@
 /**
  * A use position represents a live interval use at a given position.
  */
-class UsePosition : public ArenaObject<kArenaAllocMisc> {
+class UsePosition : public ArenaObject<kArenaAllocSsaLiveness> {
  public:
   UsePosition(HInstruction* user,
               HEnvironment* environment,
@@ -169,7 +169,7 @@
   DISALLOW_COPY_AND_ASSIGN(UsePosition);
 };
 
-class SafepointPosition : public ArenaObject<kArenaAllocMisc> {
+class SafepointPosition : public ArenaObject<kArenaAllocSsaLiveness> {
  public:
   explicit SafepointPosition(HInstruction* instruction)
       : instruction_(instruction),
diff --git a/compiler/optimizing/stack_map_stream.cc b/compiler/optimizing/stack_map_stream.cc
index 1f0bac5..f27cecc 100644
--- a/compiler/optimizing/stack_map_stream.cc
+++ b/compiler/optimizing/stack_map_stream.cc
@@ -30,8 +30,8 @@
   current_entry_.sp_mask = sp_mask;
   current_entry_.num_dex_registers = num_dex_registers;
   current_entry_.inlining_depth = inlining_depth;
-  current_entry_.dex_register_locations_start_index = dex_register_locations_.Size();
-  current_entry_.inline_infos_start_index = inline_infos_.Size();
+  current_entry_.dex_register_locations_start_index = dex_register_locations_.size();
+  current_entry_.inline_infos_start_index = inline_infos_.size();
   current_entry_.dex_register_map_hash = 0;
   current_entry_.same_dex_register_map_as_ = kNoSameDexMapFound;
   if (num_dex_registers != 0) {
@@ -55,7 +55,7 @@
 
 void StackMapStream::EndStackMapEntry() {
   current_entry_.same_dex_register_map_as_ = FindEntryWithTheSameDexMap();
-  stack_maps_.Add(current_entry_);
+  stack_maps_.push_back(current_entry_);
   current_entry_ = StackMapEntry();
 }
 
@@ -73,12 +73,12 @@
     auto it = location_catalog_entries_indices_.Find(location);
     if (it != location_catalog_entries_indices_.end()) {
       // Retrieve the index from the hash map.
-      dex_register_locations_.Add(it->second);
+      dex_register_locations_.push_back(it->second);
     } else {
       // Create a new entry in the location catalog and the hash map.
-      size_t index = location_catalog_entries_.Size();
-      location_catalog_entries_.Add(location);
-      dex_register_locations_.Add(index);
+      size_t index = location_catalog_entries_.size();
+      location_catalog_entries_.push_back(location);
+      dex_register_locations_.push_back(index);
       location_catalog_entries_indices_.Insert(std::make_pair(location, index));
     }
 
@@ -108,7 +108,7 @@
   current_inline_info_.dex_pc = dex_pc;
   current_inline_info_.invoke_type = invoke_type;
   current_inline_info_.num_dex_registers = num_dex_registers;
-  current_inline_info_.dex_register_locations_start_index = dex_register_locations_.Size();
+  current_inline_info_.dex_register_locations_start_index = dex_register_locations_.size();
   if (num_dex_registers != 0) {
     current_inline_info_.live_dex_registers_mask =
         new (allocator_) ArenaBitVector(allocator_, num_dex_registers, true);
@@ -123,14 +123,14 @@
   DCHECK_EQ(current_dex_register_, current_inline_info_.num_dex_registers)
       << "Inline information contains less registers than expected";
   in_inline_frame_ = false;
-  inline_infos_.Add(current_inline_info_);
+  inline_infos_.push_back(current_inline_info_);
   current_inline_info_ = InlineInfoEntry();
 }
 
 uint32_t StackMapStream::ComputeMaxNativePcOffset() const {
   uint32_t max_native_pc_offset = 0u;
-  for (size_t i = 0, size = stack_maps_.Size(); i != size; ++i) {
-    max_native_pc_offset = std::max(max_native_pc_offset, stack_maps_.Get(i).native_pc_offset);
+  for (const StackMapEntry& entry : stack_maps_) {
+    max_native_pc_offset = std::max(max_native_pc_offset, entry.native_pc_offset);
   }
   return max_native_pc_offset;
 }
@@ -147,7 +147,7 @@
                                                           dex_pc_max_,
                                                           max_native_pc_offset,
                                                           register_mask_max_);
-  stack_maps_size_ = stack_maps_.Size() * stack_map_encoding_.ComputeStackMapSize();
+  stack_maps_size_ = stack_maps_.size() * stack_map_encoding_.ComputeStackMapSize();
   dex_register_location_catalog_size_ = ComputeDexRegisterLocationCatalogSize();
 
   // Note: use RoundUp to word-size here if you want CodeInfo objects to be word aligned.
@@ -170,33 +170,28 @@
 
 size_t StackMapStream::ComputeDexRegisterLocationCatalogSize() const {
   size_t size = DexRegisterLocationCatalog::kFixedSize;
-  for (size_t location_catalog_entry_index = 0;
-       location_catalog_entry_index < location_catalog_entries_.Size();
-       ++location_catalog_entry_index) {
-    DexRegisterLocation dex_register_location =
-        location_catalog_entries_.Get(location_catalog_entry_index);
+  for (const DexRegisterLocation& dex_register_location : location_catalog_entries_) {
     size += DexRegisterLocationCatalog::EntrySize(dex_register_location);
   }
   return size;
 }
 
 size_t StackMapStream::ComputeDexRegisterMapSize(uint32_t num_dex_registers,
-                                                 const BitVector& live_dex_registers_mask) const {
+                                                 const BitVector* live_dex_registers_mask) const {
+  // For num_dex_registers == 0u, live_dex_registers_mask may be null.
+  if (num_dex_registers == 0u) {
+    return 0u;  // No register map will be emitted.
+  }
+  DCHECK(live_dex_registers_mask != nullptr);
+
   // Size of the map in bytes.
   size_t size = DexRegisterMap::kFixedSize;
   // Add the live bit mask for the Dex register liveness.
   size += DexRegisterMap::GetLiveBitMaskSize(num_dex_registers);
   // Compute the size of the set of live Dex register entries.
-  size_t number_of_live_dex_registers = 0;
-  for (size_t dex_register_number = 0;
-       dex_register_number < num_dex_registers;
-       ++dex_register_number) {
-    if (live_dex_registers_mask.IsBitSet(dex_register_number)) {
-      ++number_of_live_dex_registers;
-    }
-  }
+  size_t number_of_live_dex_registers = live_dex_registers_mask->NumSetBits();
   size_t map_entries_size_in_bits =
-      DexRegisterMap::SingleEntrySizeInBits(location_catalog_entries_.Size())
+      DexRegisterMap::SingleEntrySizeInBits(location_catalog_entries_.size())
       * number_of_live_dex_registers;
   size_t map_entries_size_in_bytes =
       RoundUp(map_entries_size_in_bits, kBitsPerByte) / kBitsPerByte;
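
Note: the deleted loop above was a population count in disguise, so it reduces to one NumSetBits() call on the liveness mask. A plain-C++ sketch of the equivalence (std::bitset standing in for the ART BitVector):

    #include <bitset>
    #include <cassert>

    int main() {
      std::bitset<16> live("0000000010110010");
      size_t by_loop = 0;
      for (size_t i = 0; i < live.size(); ++i) {
        if (live.test(i)) {
          ++by_loop;  // old shape: probe each register's liveness bit
        }
      }
      assert(by_loop == live.count());  // new shape: a single popcount
      return 0;
    }
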
@@ -207,24 +202,24 @@
 size_t StackMapStream::ComputeDexRegisterMapsSize() const {
   size_t size = 0;
   size_t inline_info_index = 0;
-  for (size_t i = 0; i < stack_maps_.Size(); ++i) {
-    StackMapEntry entry = stack_maps_.Get(i);
+  for (const StackMapEntry& entry : stack_maps_) {
     if (entry.same_dex_register_map_as_ == kNoSameDexMapFound) {
-      size += ComputeDexRegisterMapSize(entry.num_dex_registers, *entry.live_dex_registers_mask);
+      size += ComputeDexRegisterMapSize(entry.num_dex_registers, entry.live_dex_registers_mask);
     } else {
       // Entries with the same dex map will have the same offset.
     }
     for (size_t j = 0; j < entry.inlining_depth; ++j) {
-      InlineInfoEntry inline_entry = inline_infos_.Get(inline_info_index++);
+      DCHECK_LT(inline_info_index, inline_infos_.size());
+      InlineInfoEntry inline_entry = inline_infos_[inline_info_index++];
       size += ComputeDexRegisterMapSize(inline_entry.num_dex_registers,
-                                        *inline_entry.live_dex_registers_mask);
+                                        inline_entry.live_dex_registers_mask);
     }
   }
   return size;
 }
 
 size_t StackMapStream::ComputeInlineInfoSize() const {
-  return inline_infos_.Size() * InlineInfo::SingleEntrySize()
+  return inline_infos_.size() * InlineInfo::SingleEntrySize()
     // For encoding the depth.
     + (number_of_stack_maps_with_inline_info_ * InlineInfo::kFixedSize);
 }
@@ -244,19 +239,18 @@
       inline_infos_start_, inline_info_size_);
 
   code_info.SetEncoding(stack_map_encoding_);
-  code_info.SetNumberOfStackMaps(stack_maps_.Size());
+  code_info.SetNumberOfStackMaps(stack_maps_.size());
   DCHECK_EQ(code_info.GetStackMapsSize(code_info.ExtractEncoding()), stack_maps_size_);
 
   // Set the Dex register location catalog.
-  code_info.SetNumberOfLocationCatalogEntries(location_catalog_entries_.Size());
+  code_info.SetNumberOfLocationCatalogEntries(location_catalog_entries_.size());
   MemoryRegion dex_register_location_catalog_region = region.Subregion(
       dex_register_location_catalog_start_, dex_register_location_catalog_size_);
   DexRegisterLocationCatalog dex_register_location_catalog(dex_register_location_catalog_region);
   // Offset in `dex_register_location_catalog` where to store the next
   // register location.
   size_t location_catalog_offset = DexRegisterLocationCatalog::kFixedSize;
-  for (size_t i = 0, e = location_catalog_entries_.Size(); i < e; ++i) {
-    DexRegisterLocation dex_register_location = location_catalog_entries_.Get(i);
+  for (DexRegisterLocation dex_register_location : location_catalog_entries_) {
     dex_register_location_catalog.SetRegisterInfo(location_catalog_offset, dex_register_location);
     location_catalog_offset += DexRegisterLocationCatalog::EntrySize(dex_register_location);
   }
@@ -265,9 +259,9 @@
 
   uintptr_t next_dex_register_map_offset = 0;
   uintptr_t next_inline_info_offset = 0;
-  for (size_t i = 0, e = stack_maps_.Size(); i < e; ++i) {
+  for (size_t i = 0, e = stack_maps_.size(); i < e; ++i) {
     StackMap stack_map = code_info.GetStackMapAt(i, stack_map_encoding_);
-    StackMapEntry entry = stack_maps_.Get(i);
+    StackMapEntry entry = stack_maps_[i];
 
     stack_map.SetDexPc(stack_map_encoding_, entry.dex_pc);
     stack_map.SetNativePcOffset(stack_map_encoding_, entry.native_pc_offset);
@@ -291,7 +285,7 @@
         // New dex registers maps should be added to the stack map.
         MemoryRegion register_region = dex_register_locations_region.Subregion(
             next_dex_register_map_offset,
-            ComputeDexRegisterMapSize(entry.num_dex_registers, *entry.live_dex_registers_mask));
+            ComputeDexRegisterMapSize(entry.num_dex_registers, entry.live_dex_registers_mask));
         next_dex_register_map_offset += register_region.size();
         DexRegisterMap dex_register_map(register_region);
         stack_map.SetDexRegisterMapOffset(
@@ -318,8 +312,9 @@
           stack_map_encoding_, inline_region.start() - dex_register_locations_region.start());
 
       inline_info.SetDepth(entry.inlining_depth);
+      DCHECK_LE(entry.inline_infos_start_index + entry.inlining_depth, inline_infos_.size());
       for (size_t depth = 0; depth < entry.inlining_depth; ++depth) {
-        InlineInfoEntry inline_entry = inline_infos_.Get(depth + entry.inline_infos_start_index);
+        InlineInfoEntry inline_entry = inline_infos_[depth + entry.inline_infos_start_index];
         inline_info.SetMethodIndexAtDepth(depth, inline_entry.method_index);
         inline_info.SetDexPcAtDepth(depth, inline_entry.dex_pc);
         inline_info.SetInvokeTypeAtDepth(depth, inline_entry.invoke_type);
@@ -331,7 +326,7 @@
           MemoryRegion register_region = dex_register_locations_region.Subregion(
               next_dex_register_map_offset,
               ComputeDexRegisterMapSize(inline_entry.num_dex_registers,
-                                        *inline_entry.live_dex_registers_mask));
+                                        inline_entry.live_dex_registers_mask));
           next_dex_register_map_offset += register_region.size();
           DexRegisterMap dex_register_map(register_region);
           inline_info.SetDexRegisterMapOffsetAtDepth(
@@ -357,42 +352,43 @@
                                           uint32_t start_index_in_dex_register_locations) const {
   dex_register_map.SetLiveBitMask(num_dex_registers, live_dex_registers_mask);
   // Set the dex register location mapping data.
-  for (size_t dex_register_number = 0, index_in_dex_register_locations = 0;
-       dex_register_number < num_dex_registers;
-       ++dex_register_number) {
-    if (live_dex_registers_mask.IsBitSet(dex_register_number)) {
-      size_t location_catalog_entry_index = dex_register_locations_.Get(
-          start_index_in_dex_register_locations + index_in_dex_register_locations);
-      dex_register_map.SetLocationCatalogEntryIndex(
-          index_in_dex_register_locations,
-          location_catalog_entry_index,
-          num_dex_registers,
-          location_catalog_entries_.Size());
-      ++index_in_dex_register_locations;
-    }
+  size_t number_of_live_dex_registers = live_dex_registers_mask.NumSetBits();
+  DCHECK_LE(number_of_live_dex_registers, dex_register_locations_.size());
+  DCHECK_LE(start_index_in_dex_register_locations,
+            dex_register_locations_.size() - number_of_live_dex_registers);
+  for (size_t index_in_dex_register_locations = 0;
+       index_in_dex_register_locations != number_of_live_dex_registers;
+       ++index_in_dex_register_locations) {
+    size_t location_catalog_entry_index = dex_register_locations_[
+        start_index_in_dex_register_locations + index_in_dex_register_locations];
+    dex_register_map.SetLocationCatalogEntryIndex(
+        index_in_dex_register_locations,
+        location_catalog_entry_index,
+        num_dex_registers,
+        location_catalog_entries_.size());
   }
 }
 
 size_t StackMapStream::FindEntryWithTheSameDexMap() {
-  size_t current_entry_index = stack_maps_.Size();
+  size_t current_entry_index = stack_maps_.size();
   auto entries_it = dex_map_hash_to_stack_map_indices_.find(current_entry_.dex_register_map_hash);
   if (entries_it == dex_map_hash_to_stack_map_indices_.end()) {
     // We don't have a perfect hash function, so we need a list to collect all stack maps
     // which might have the same dex register map.
-    GrowableArray<uint32_t> stack_map_indices(allocator_, 1);
-    stack_map_indices.Add(current_entry_index);
-    dex_map_hash_to_stack_map_indices_.Put(current_entry_.dex_register_map_hash, stack_map_indices);
+    ArenaVector<uint32_t> stack_map_indices(allocator_->Adapter(kArenaAllocStackMapStream));
+    stack_map_indices.push_back(current_entry_index);
+    dex_map_hash_to_stack_map_indices_.Put(current_entry_.dex_register_map_hash,
+                                           std::move(stack_map_indices));
     return kNoSameDexMapFound;
   }
 
   // We might have collisions, so we need to check whether or not we really have a match.
-  for (size_t i = 0; i < entries_it->second.Size(); i++) {
-    size_t test_entry_index = entries_it->second.Get(i);
-    if (HaveTheSameDexMaps(stack_maps_.Get(test_entry_index), current_entry_)) {
+  for (uint32_t test_entry_index : entries_it->second) {
+    if (HaveTheSameDexMaps(GetStackMap(test_entry_index), current_entry_)) {
       return test_entry_index;
     }
   }
-  entries_it->second.Add(current_entry_index);
+  entries_it->second.push_back(current_entry_index);
   return kNoSameDexMapFound;
 }
 
@@ -406,21 +402,22 @@
   if (a.num_dex_registers != b.num_dex_registers) {
     return false;
   }
-
-  int index_in_dex_register_locations = 0;
-  for (uint32_t i = 0; i < a.num_dex_registers; i++) {
-    if (a.live_dex_registers_mask->IsBitSet(i) != b.live_dex_registers_mask->IsBitSet(i)) {
+  if (a.num_dex_registers != 0u) {
+    DCHECK(a.live_dex_registers_mask != nullptr);
+    DCHECK(b.live_dex_registers_mask != nullptr);
+    if (!a.live_dex_registers_mask->Equal(b.live_dex_registers_mask)) {
       return false;
     }
-    if (a.live_dex_registers_mask->IsBitSet(i)) {
-      size_t a_loc = dex_register_locations_.Get(
-          a.dex_register_locations_start_index + index_in_dex_register_locations);
-      size_t b_loc = dex_register_locations_.Get(
-          b.dex_register_locations_start_index + index_in_dex_register_locations);
-      if (a_loc != b_loc) {
-        return false;
-      }
-      ++index_in_dex_register_locations;
+    size_t number_of_live_dex_registers = a.live_dex_registers_mask->NumSetBits();
+    DCHECK_LE(number_of_live_dex_registers, dex_register_locations_.size());
+    DCHECK_LE(a.dex_register_locations_start_index,
+              dex_register_locations_.size() - number_of_live_dex_registers);
+    DCHECK_LE(b.dex_register_locations_start_index,
+              dex_register_locations_.size() - number_of_live_dex_registers);
+    auto a_begin = dex_register_locations_.begin() + a.dex_register_locations_start_index;
+    auto b_begin = dex_register_locations_.begin() + b.dex_register_locations_start_index;
+    if (!std::equal(a_begin, a_begin + number_of_live_dex_registers, b_begin)) {
+      return false;
     }
   }
   return true;
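
Note: with equal liveness masks and register counts, two entries share a dex register map exactly when their slices of the shared dex_register_locations_ pool match element-wise, which is what the std::equal call expresses. A small standalone illustration with hypothetical pool contents:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      // Concatenated location-catalog indices; entry A starts at 0, entry B
      // at 3, both with 3 live registers.
      std::vector<size_t> pool = {7, 1, 4, 7, 1, 4, 2};
      auto a_begin = pool.begin() + 0;
      auto b_begin = pool.begin() + 3;
      assert(std::equal(a_begin, a_begin + 3, b_begin));  // maps match
      return 0;
    }
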
diff --git a/compiler/optimizing/stack_map_stream.h b/compiler/optimizing/stack_map_stream.h
index 703b6f7..4783e28 100644
--- a/compiler/optimizing/stack_map_stream.h
+++ b/compiler/optimizing/stack_map_stream.h
@@ -24,7 +24,6 @@
 #include "memory_region.h"
 #include "nodes.h"
 #include "stack_map.h"
-#include "utils/growable_array.h"
 
 namespace art {
 
@@ -62,15 +61,16 @@
  public:
   explicit StackMapStream(ArenaAllocator* allocator)
       : allocator_(allocator),
-        stack_maps_(allocator, 10),
-        location_catalog_entries_(allocator, 4),
-        dex_register_locations_(allocator, 10 * 4),
-        inline_infos_(allocator, 2),
+        stack_maps_(allocator->Adapter(kArenaAllocStackMapStream)),
+        location_catalog_entries_(allocator->Adapter(kArenaAllocStackMapStream)),
+        dex_register_locations_(allocator->Adapter(kArenaAllocStackMapStream)),
+        inline_infos_(allocator->Adapter(kArenaAllocStackMapStream)),
         stack_mask_max_(-1),
         dex_pc_max_(0),
         register_mask_max_(0),
         number_of_stack_maps_with_inline_info_(0),
-        dex_map_hash_to_stack_map_indices_(std::less<uint32_t>(), allocator->Adapter()),
+        dex_map_hash_to_stack_map_indices_(std::less<uint32_t>(),
+                                           allocator->Adapter(kArenaAllocStackMapStream)),
         current_entry_(),
         current_inline_info_(),
         stack_mask_size_(0),
@@ -84,7 +84,12 @@
         inline_infos_start_(0),
         needed_size_(0),
         current_dex_register_(0),
-        in_inline_frame_(false) {}
+        in_inline_frame_(false) {
+    stack_maps_.reserve(10);
+    location_catalog_entries_.reserve(4);
+    dex_register_locations_.reserve(10 * 4);
+    inline_infos_.reserve(2);
+  }
 
   // See runtime/stack_map.h to know what these fields contain.
   struct StackMapEntry {
@@ -127,17 +132,17 @@
   void EndInlineInfoEntry();
 
   size_t GetNumberOfStackMaps() const {
-    return stack_maps_.Size();
+    return stack_maps_.size();
   }
 
   const StackMapEntry& GetStackMap(size_t i) const {
-    DCHECK_LT(i, stack_maps_.Size());
-    return stack_maps_.GetRawStorage()[i];
+    DCHECK_LT(i, stack_maps_.size());
+    return stack_maps_[i];
   }
 
   void SetStackMapNativePcOffset(size_t i, uint32_t native_pc_offset) {
-    DCHECK_LT(i, stack_maps_.Size());
-    stack_maps_.GetRawStorage()[i].native_pc_offset = native_pc_offset;
+    DCHECK_LT(i, stack_maps_.size());
+    stack_maps_[i].native_pc_offset = native_pc_offset;
   }
 
   uint32_t ComputeMaxNativePcOffset() const;
@@ -150,7 +155,7 @@
  private:
   size_t ComputeDexRegisterLocationCatalogSize() const;
   size_t ComputeDexRegisterMapSize(uint32_t num_dex_registers,
-                                   const BitVector& live_dex_registers_mask) const;
+                                   const BitVector* live_dex_registers_mask) const;
   size_t ComputeDexRegisterMapsSize() const;
   size_t ComputeInlineInfoSize() const;
 
@@ -164,10 +169,10 @@
                             uint32_t start_index_in_dex_register_locations) const;
 
   ArenaAllocator* allocator_;
-  GrowableArray<StackMapEntry> stack_maps_;
+  ArenaVector<StackMapEntry> stack_maps_;
 
   // A catalog of unique [location_kind, register_value] pairs (per method).
-  GrowableArray<DexRegisterLocation> location_catalog_entries_;
+  ArenaVector<DexRegisterLocation> location_catalog_entries_;
   // Map from Dex register location catalog entries to their indices in the
   // location catalog.
   typedef HashMap<DexRegisterLocation, size_t, LocationCatalogEntriesIndicesEmptyFn,
@@ -175,14 +180,14 @@
   LocationCatalogEntriesIndices location_catalog_entries_indices_;
 
   // A set of concatenated maps of Dex register locations indices to `location_catalog_entries_`.
-  GrowableArray<size_t> dex_register_locations_;
-  GrowableArray<InlineInfoEntry> inline_infos_;
+  ArenaVector<size_t> dex_register_locations_;
+  ArenaVector<InlineInfoEntry> inline_infos_;
   int stack_mask_max_;
   uint32_t dex_pc_max_;
   uint32_t register_mask_max_;
   size_t number_of_stack_maps_with_inline_info_;
 
-  ArenaSafeMap<uint32_t, GrowableArray<uint32_t>> dex_map_hash_to_stack_map_indices_;
+  ArenaSafeMap<uint32_t, ArenaVector<uint32_t>> dex_map_hash_to_stack_map_indices_;
 
   StackMapEntry current_entry_;
   InlineInfoEntry current_inline_info_;
diff --git a/compiler/utils/growable_array.h b/compiler/utils/growable_array.h
deleted file mode 100644
index f85e026..0000000
--- a/compiler/utils/growable_array.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Copyright (C) 2013 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef ART_COMPILER_UTILS_GROWABLE_ARRAY_H_
-#define ART_COMPILER_UTILS_GROWABLE_ARRAY_H_
-
-#include <stdint.h>
-#include <stddef.h>
-
-#include "base/arena_object.h"
-
-namespace art {
-
-// Deprecated
-// TODO: Replace all uses with ArenaVector<T>.
-template<typename T>
-class GrowableArray : public ArenaObject<kArenaAllocGrowableArray> {
-  public:
-    GrowableArray(ArenaAllocator* arena, size_t init_length)
-      : arena_(arena),
-        num_allocated_(init_length),
-        num_used_(0) {
-      elem_list_ = arena_->AllocArray<T>(init_length, kArenaAllocGrowableArray);
-    }
-
-    GrowableArray(ArenaAllocator* arena, size_t init_length, T initial_data)
-      : arena_(arena),
-        num_allocated_(init_length),
-        num_used_(init_length) {
-      elem_list_ = arena_->AllocArray<T>(init_length, kArenaAllocGrowableArray);
-      for (size_t i = 0; i < init_length; ++i) {
-        elem_list_[i] = initial_data;
-      }
-    }
-
-    bool Contains(T value, size_t start_from = 0) const {
-      for (size_t i = start_from; i < num_used_; ++i) {
-        if (elem_list_[i] == value) {
-          return true;
-        }
-      }
-      return false;
-    }
-
-    // Expand the list size to at least new length.
-    void Resize(size_t new_length) {
-      if (new_length <= num_allocated_) return;
-      // If it's a small list double the size, else grow 1.5x.
-      size_t target_length =
-          (num_allocated_ < 128) ? num_allocated_ << 1 : num_allocated_ + (num_allocated_ >> 1);
-      if (new_length > target_length) {
-         target_length = new_length;
-      }
-      T* new_array = arena_->AllocArray<T>(target_length, kArenaAllocGrowableArray);
-      memcpy(new_array, elem_list_, sizeof(T) * num_allocated_);
-      num_allocated_ = target_length;
-      elem_list_ = new_array;
-    }
-
-    // NOTE: does not return storage, just resets use count.
-    void Reset() {
-      num_used_ = 0;
-    }
-
-    // Insert an element to the end of a list, resizing if necessary.
-    void Insert(T elem) {
-      if (num_used_ == num_allocated_) {
-        Resize(num_used_ + 1);
-      }
-      elem_list_[num_used_++] = elem;
-    }
-
-    void InsertAt(size_t index, T elem) {
-      DCHECK(index <= Size());
-      Insert(elem);
-      for (size_t i = Size() - 1; i > index; --i) {
-        elem_list_[i] = elem_list_[i - 1];
-      }
-      elem_list_[index] = elem;
-    }
-
-    void Add(T elem) {
-      Insert(elem);
-    }
-
-    T Get(size_t index) const {
-      DCHECK_LT(index, num_used_);
-      return elem_list_[index];
-    }
-
-    // Overwrite existing element at position index.  List must be large enough.
-    void Put(size_t index, T elem) {
-      DCHECK_LT(index, num_used_);
-      elem_list_[index] = elem;
-    }
-
-    void Increment(size_t index) {
-      DCHECK_LT(index, num_used_);
-      elem_list_[index]++;
-    }
-
-    /*
-     * Remove an existing element from list.  If there are more than one copy
-     * of the element, only the first one encountered will be deleted.
-     */
-    // TODO: consider renaming this.
-    void Delete(T element) {
-      bool found = false;
-      for (size_t i = 0; i < num_used_ - 1; i++) {
-        if (!found && elem_list_[i] == element) {
-          found = true;
-        }
-        if (found) {
-          elem_list_[i] = elem_list_[i+1];
-        }
-      }
-      // We should either have found the element, or it was the last (unscanned) element.
-      DCHECK(found || (element == elem_list_[num_used_ - 1]));
-      num_used_--;
-    }
-
-    void DeleteAt(size_t index) {
-      for (size_t i = index; i < num_used_ - 1; i++) {
-        elem_list_[i] = elem_list_[i + 1];
-      }
-      num_used_--;
-    }
-
-    size_t GetNumAllocated() const { return num_allocated_; }
-
-    size_t Size() const { return num_used_; }
-
-    bool IsEmpty() const { return num_used_ == 0; }
-
-    T Pop() {
-      DCHECK_GE(num_used_, (size_t)0);
-      return elem_list_[--num_used_];
-    }
-
-    T Peek() const {
-      DCHECK_GE(num_used_, (size_t)0);
-      return elem_list_[num_used_ - 1];
-    }
-
-    void SetSize(size_t new_size) {
-      Resize(new_size);
-      num_used_ = new_size;
-    }
-
-    T* GetRawStorage() const { return elem_list_; }
-
-  private:
-    ArenaAllocator* const arena_;
-    size_t num_allocated_;
-    size_t num_used_;
-    T* elem_list_;
-};
-
-}  // namespace art
-
-#endif  // ART_COMPILER_UTILS_GROWABLE_ARRAY_H_
diff --git a/runtime/arch/mips/memcmp16_mips.S b/runtime/arch/mips/memcmp16_mips.S
index aef81af..c8eac9b 100644
--- a/runtime/arch/mips/memcmp16_mips.S
+++ b/runtime/arch/mips/memcmp16_mips.S
@@ -26,7 +26,7 @@
   beqz  $a2,done   /* 0 length string */
   beq $a0,$a1,done    /* strings are identical */
 
-  /* Unoptimised... */
+  /* Unoptimized... */
 1:  lhu $t0,0($a0)
   lhu $t1,0($a1)
   addu  $a1,2
diff --git a/runtime/base/arena_allocator.cc b/runtime/base/arena_allocator.cc
index c1a1088..345428c 100644
--- a/runtime/base/arena_allocator.cc
+++ b/runtime/base/arena_allocator.cc
@@ -83,12 +83,17 @@
   "BaselineMaps ",
   "Optimization ",
   "GVN          ",
+  "InductionVar ",
+  "BCE          ",
   "SsaLiveness  ",
   "SsaPhiElim   ",
   "RefTypeProp  ",
   "PrimTypeProp ",
   "SideEffects  ",
   "RegAllocator ",
+  "StackMapStm  ",
+  "CodeGen      ",
+  "ParallelMove ",
 };
 
 template <bool kCount>
diff --git a/runtime/base/arena_allocator.h b/runtime/base/arena_allocator.h
index be96862..b4f19ee 100644
--- a/runtime/base/arena_allocator.h
+++ b/runtime/base/arena_allocator.h
@@ -93,12 +93,17 @@
   kArenaAllocBaselineMaps,
   kArenaAllocOptimization,
   kArenaAllocGvn,
+  kArenaAllocInductionVarAnalysis,
+  kArenaAllocBoundsCheckElimination,
   kArenaAllocSsaLiveness,
   kArenaAllocSsaPhiElimination,
   kArenaAllocReferenceTypePropagation,
   kArenaAllocPrimitiveTypePropagation,
   kArenaAllocSideEffectsAnalysis,
   kArenaAllocRegisterAllocator,
+  kArenaAllocStackMapStream,
+  kArenaAllocCodeGenerator,
+  kArenaAllocParallelMoveResolver,
   kNumArenaAllocKinds
 };
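[Editor's note: the enum above and the name table added to arena_allocator.cc earlier are parallel arrays; every new kind needs a matching name entry, in the same order. A self-contained sketch of the pattern (hypothetical names, not ART's ArenaAllocator) showing how the tag buys per-kind accounting:]

```cpp
#include <cstddef>
#include <cstdio>
#include <new>

enum AllocKind {
  kAllocMisc,
  kAllocBoundsCheckElimination,
  kAllocStackMapStream,
  kNumAllocKinds  // count sentinel, mirrors kNumArenaAllocKinds
};

// Must stay in sync with AllocKind, entry for entry.
static const char* const kAllocKindNames[kNumAllocKinds] = {
  "Misc        ",
  "BCE         ",
  "StackMapStm ",
};

struct CountingAllocator {
  size_t bytes[kNumAllocKinds] = {};
  void* Alloc(size_t size, AllocKind kind) {
    bytes[kind] += size;  // account per kind, then hand out memory
    return ::operator new(size);
  }
  void Dump() const {
    for (int k = 0; k < kNumAllocKinds; ++k) {
      std::printf("%s %zu bytes\n", kAllocKindNames[k], bytes[k]);
    }
  }
};

int main() {
  CountingAllocator allocator;
  allocator.Alloc(64, kAllocStackMapStream);
  allocator.Alloc(32, kAllocBoundsCheckElimination);
  allocator.Dump();  // allocations deliberately leaked in this sketch
  return 0;
}
```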
 
diff --git a/runtime/base/arena_object.h b/runtime/base/arena_object.h
index ab97d0c..56e35d8 100644
--- a/runtime/base/arena_object.h
+++ b/runtime/base/arena_object.h
@@ -40,6 +40,10 @@
     LOG(FATAL) << "UNREACHABLE";
     UNREACHABLE();
   }
+
+  // NOTE: Providing placement new (and matching delete) for constructing container elements.
+  ALWAYS_INLINE void* operator new(size_t, void* ptr) noexcept { return ptr; }
+  ALWAYS_INLINE void operator delete(void*, void*) noexcept { }
 };
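[Editor's note: a minimal sketch of what the added operators enable (hypothetical class, not ART code; ordinary new is simply deleted here for brevity, whereas ArenaObject routes it to the arena). Containers construct elements with placement new in storage they already own, which fails to compile unless the class itself provides the placement form, because the class-level operator new hides the global ones.]

```cpp
#include <cstddef>

class ArenaOnly {
 public:
  explicit ArenaOnly(int v) : value_(v) {}
  // Ordinary heap allocation is unavailable; only in-place construction works.
  void* operator new(size_t) = delete;
  // Placement new simply returns the caller-provided storage...
  void* operator new(size_t, void* ptr) noexcept { return ptr; }
  // ...and the matching delete is only used if the constructor throws.
  void operator delete(void*, void*) noexcept {}
  int value() const { return value_; }

 private:
  int value_;
};

int main() {
  alignas(ArenaOnly) unsigned char storage[sizeof(ArenaOnly)];
  ArenaOnly* obj = new (storage) ArenaOnly(42);  // constructs in-place
  int v = obj->value();
  obj->~ArenaOnly();  // destroy manually; the storage outlives the object
  return v == 42 ? 0 : 1;
}
```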
 
 
diff --git a/runtime/class_linker.cc b/runtime/class_linker.cc
index 8e42040..dbc5cec 100644
--- a/runtime/class_linker.cc
+++ b/runtime/class_linker.cc
@@ -1500,7 +1500,7 @@
   Thread* const self = Thread::Current();
   JavaVMExt* const vm = Runtime::Current()->GetJavaVM();
   for (const ClassLoaderData& data : class_loaders_) {
-    vm->DecodeWeakGlobalDuringShutdown(self, data.weak_root);
+    vm->DeleteWeakGlobalRef(self, data.weak_root);
     delete data.allocator;
     delete data.class_table;
   }
@@ -4186,6 +4186,8 @@
     data.allocator = Runtime::Current()->CreateLinearAlloc();
     class_loaders_.push_back(data);
     // We don't already have a class table, so add it to the class loader.
+    CHECK(class_loader->GetClassTable() == nullptr);
+    CHECK(class_loader->GetAllocator() == nullptr);
     class_loader->SetClassTable(data.class_table);
     class_loader->SetAllocator(data.allocator);
   }
diff --git a/runtime/class_linker.h b/runtime/class_linker.h
index f705330..739403f 100644
--- a/runtime/class_linker.h
+++ b/runtime/class_linker.h
@@ -560,7 +560,7 @@
 
  private:
   struct ClassLoaderData {
-    jobject weak_root;  // Weak root to enable class unloading.
+    jweak weak_root;  // Weak root to enable class unloading.
     ClassTable* class_table;
     LinearAlloc* allocator;
   };
diff --git a/runtime/entrypoints/entrypoint_utils-inl.h b/runtime/entrypoints/entrypoint_utils-inl.h
index cc3eefe..8ae0b07 100644
--- a/runtime/entrypoints/entrypoint_utils-inl.h
+++ b/runtime/entrypoints/entrypoint_utils-inl.h
@@ -71,44 +71,6 @@
       *outer_method->GetDexFile(), method_index, dex_cache, class_loader, nullptr, invoke_type);
 }
 
-inline ArtMethod* GetCalleeSaveMethodCaller(ArtMethod** sp,
-                                            Runtime::CalleeSaveType type,
-                                            bool do_caller_check = false)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
-  DCHECK_EQ(*sp, Runtime::Current()->GetCalleeSaveMethod(type));
-
-  const size_t callee_frame_size = GetCalleeSaveFrameSize(kRuntimeISA, type);
-  auto** caller_sp = reinterpret_cast<ArtMethod**>(
-      reinterpret_cast<uintptr_t>(sp) + callee_frame_size);
-  ArtMethod* outer_method = *caller_sp;
-  ArtMethod* caller = outer_method;
-
-  if ((outer_method != nullptr) && outer_method->IsOptimized(sizeof(void*))) {
-    const size_t callee_return_pc_offset = GetCalleeSaveReturnPcOffset(kRuntimeISA, type);
-    uintptr_t caller_pc = *reinterpret_cast<uintptr_t*>(
-        (reinterpret_cast<uint8_t*>(sp) + callee_return_pc_offset));
-    uintptr_t native_pc_offset = outer_method->NativeQuickPcOffset(caller_pc);
-    CodeInfo code_info = outer_method->GetOptimizedCodeInfo();
-    StackMapEncoding encoding = code_info.ExtractEncoding();
-    StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset, encoding);
-    DCHECK(stack_map.IsValid());
-    if (stack_map.HasInlineInfo(encoding)) {
-      InlineInfo inline_info = code_info.GetInlineInfoOf(stack_map, encoding);
-      caller = GetResolvedMethod(outer_method, inline_info, inline_info.GetDepth() - 1);
-    }
-  }
-
-  if (kIsDebugBuild && do_caller_check) {
-    // Note that do_caller_check is optional, as this method can be called by
-    // stubs, and tests without a proper call stack.
-    NthCallerVisitor visitor(Thread::Current(), 1, true);
-    visitor.WalkStack();
-    CHECK_EQ(caller, visitor.caller);
-  }
-
-  return caller;
-}
-
 inline ArtMethod* GetCalleeSaveMethodCaller(Thread* self, Runtime::CalleeSaveType type)
     SHARED_REQUIRES(Locks::mutator_lock_) {
   return GetCalleeSaveMethodCaller(
diff --git a/runtime/entrypoints/entrypoint_utils.cc b/runtime/entrypoints/entrypoint_utils.cc
index 94aced2..f193999 100644
--- a/runtime/entrypoints/entrypoint_utils.cc
+++ b/runtime/entrypoints/entrypoint_utils.cc
@@ -21,11 +21,15 @@
 #include "base/mutex.h"
 #include "class_linker-inl.h"
 #include "dex_file-inl.h"
+#include "entrypoints/entrypoint_utils-inl.h"
+#include "entrypoints/quick/callee_save_frame.h"
+#include "entrypoints/runtime_asm_entrypoints.h"
 #include "gc/accounting/card_table-inl.h"
 #include "mirror/class-inl.h"
 #include "mirror/method.h"
 #include "mirror/object-inl.h"
 #include "mirror/object_array-inl.h"
+#include "nth_caller_visitor.h"
 #include "reflection.h"
 #include "scoped_thread_state_change.h"
 #include "ScopedLocalRef.h"
@@ -345,4 +349,54 @@
   return true;
 }
 
+ArtMethod* GetCalleeSaveMethodCaller(ArtMethod** sp,
+                                     Runtime::CalleeSaveType type,
+                                     bool do_caller_check)
+    SHARED_REQUIRES(Locks::mutator_lock_) {
+  DCHECK_EQ(*sp, Runtime::Current()->GetCalleeSaveMethod(type));
+
+  const size_t callee_frame_size = GetCalleeSaveFrameSize(kRuntimeISA, type);
+  auto** caller_sp = reinterpret_cast<ArtMethod**>(
+      reinterpret_cast<uintptr_t>(sp) + callee_frame_size);
+  ArtMethod* outer_method = *caller_sp;
+  ArtMethod* caller = outer_method;
+
+  if ((outer_method != nullptr) && outer_method->IsOptimized(sizeof(void*))) {
+    const size_t callee_return_pc_offset = GetCalleeSaveReturnPcOffset(kRuntimeISA, type);
+    uintptr_t caller_pc = *reinterpret_cast<uintptr_t*>(
+        (reinterpret_cast<uint8_t*>(sp) + callee_return_pc_offset));
+    if (LIKELY(caller_pc != reinterpret_cast<uintptr_t>(GetQuickInstrumentationExitPc()))) {
+      uintptr_t native_pc_offset = outer_method->NativeQuickPcOffset(caller_pc);
+      CodeInfo code_info = outer_method->GetOptimizedCodeInfo();
+      StackMapEncoding encoding = code_info.ExtractEncoding();
+      StackMap stack_map = code_info.GetStackMapForNativePcOffset(native_pc_offset, encoding);
+      DCHECK(stack_map.IsValid());
+      if (stack_map.HasInlineInfo(encoding)) {
+        InlineInfo inline_info = code_info.GetInlineInfoOf(stack_map, encoding);
+        caller = GetResolvedMethod(outer_method, inline_info, inline_info.GetDepth() - 1);
+      }
+    } else {
+      // We're instrumenting, just use the StackVisitor which knows how to
+      // handle instrumented frames.
+      NthCallerVisitor visitor(Thread::Current(), 1, true);
+      visitor.WalkStack();
+      caller = visitor.caller;
+      if (kIsDebugBuild) {
+        // Avoid doing the check below.
+        do_caller_check = false;
+      }
+    }
+  }
+
+  if (kIsDebugBuild && do_caller_check) {
+    // Note that do_caller_check is optional, as this method can be called by
+    // stubs, and tests without a proper call stack.
+    NthCallerVisitor visitor(Thread::Current(), 1, true);
+    visitor.WalkStack();
+    CHECK_EQ(caller, visitor.caller);
+  }
+
+  return caller;
+}
+
 }  // namespace art
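[Editor's note: for orientation, a tiny model of the pointer arithmetic at the top of GetCalleeSaveMethodCaller (fabricated frame size and types, not ART code): the caller's method pointer sits exactly callee_frame_size bytes above the stack pointer of the callee-save frame.]

```cpp
#include <cstddef>
#include <cstdint>

struct Method { const char* name; };

int main() {
  // Fake stack layout: [callee-save spill area][caller's method pointer].
  constexpr size_t kCalleeFrameSize = 4 * sizeof(void*);
  alignas(void*) uint8_t stack[kCalleeFrameSize + sizeof(Method*)] = {};
  Method caller_method{"caller"};
  Method** sp = reinterpret_cast<Method**>(stack);
  *reinterpret_cast<Method**>(stack + kCalleeFrameSize) = &caller_method;

  // The same arithmetic the runtime performs: hop over the callee-save frame.
  auto** caller_sp = reinterpret_cast<Method**>(
      reinterpret_cast<uintptr_t>(sp) + kCalleeFrameSize);
  return (*caller_sp == &caller_method) ? 0 : 1;
}
```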
diff --git a/runtime/entrypoints/entrypoint_utils.h b/runtime/entrypoints/entrypoint_utils.h
index 53f2677..4217cab 100644
--- a/runtime/entrypoints/entrypoint_utils.h
+++ b/runtime/entrypoints/entrypoint_utils.h
@@ -26,6 +26,7 @@
 #include "gc/allocator_type.h"
 #include "invoke_type.h"
 #include "jvalue.h"
+#include "runtime.h"
 
 namespace art {
 
@@ -179,6 +180,10 @@
 template <typename INT_TYPE, typename FLOAT_TYPE>
 inline INT_TYPE art_float_to_integral(FLOAT_TYPE f);
 
+ArtMethod* GetCalleeSaveMethodCaller(ArtMethod** sp,
+                                     Runtime::CalleeSaveType type,
+                                     bool do_caller_check = false);
+
 }  // namespace art
 
 #endif  // ART_RUNTIME_ENTRYPOINTS_ENTRYPOINT_UTILS_H_
diff --git a/runtime/entrypoints/quick/quick_jni_entrypoints.cc b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
index f69c39e..fc5c52e 100644
--- a/runtime/entrypoints/quick/quick_jni_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_jni_entrypoints.cc
@@ -112,4 +112,61 @@
   return JniMethodEndWithReferenceHandleResult(result, saved_local_ref_cookie, self);
 }
 
+extern uint64_t GenericJniMethodEnd(Thread* self,
+                                    uint32_t saved_local_ref_cookie,
+                                    jvalue result,
+                                    uint64_t result_f,
+                                    ArtMethod* called,
+                                    HandleScope* handle_scope)
+    // TODO: NO_THREAD_SAFETY_ANALYSIS as GoToRunnable() is NO_THREAD_SAFETY_ANALYSIS
+    NO_THREAD_SAFETY_ANALYSIS {
+  GoToRunnable(self);
+  // We need the mutator lock (i.e., calling GoToRunnable()) before accessing the shorty or the
+  // locked object.
+  jobject locked = called->IsSynchronized() ? handle_scope->GetHandle(0).ToJObject() : nullptr;
+  char return_shorty_char = called->GetShorty()[0];
+  if (return_shorty_char == 'L') {
+    if (locked != nullptr) {
+      UnlockJniSynchronizedMethod(locked, self);
+    }
+    return reinterpret_cast<uint64_t>(JniMethodEndWithReferenceHandleResult(
+        result.l, saved_local_ref_cookie, self));
+  } else {
+    if (locked != nullptr) {
+      UnlockJniSynchronizedMethod(locked, self);  // Must decode before pop.
+    }
+    PopLocalReferences(saved_local_ref_cookie, self);
+    switch (return_shorty_char) {
+      case 'F': {
+        if (kRuntimeISA == kX86) {
+          // Convert back the result to float.
+          double d = bit_cast<double, uint64_t>(result_f);
+          return bit_cast<uint32_t, float>(static_cast<float>(d));
+        } else {
+          return result_f;
+        }
+      }
+      case 'D':
+        return result_f;
+      case 'Z':
+        return result.z;
+      case 'B':
+        return result.b;
+      case 'C':
+        return result.c;
+      case 'S':
+        return result.s;
+      case 'I':
+        return result.i;
+      case 'J':
+        return result.j;
+      case 'V':
+        return 0;
+      default:
+        LOG(FATAL) << "Unexpected return shorty character " << return_shorty_char;
+        return 0;
+    }
+  }
+}
+
 }  // namespace art
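[Editor's note: the 'F' case above narrows the x86 double return back to float bits. A standalone sketch of that round trip (BitCast is a stand-in for ART's bit_cast; the values are arbitrary):]

```cpp
#include <cstdint>
#include <cstring>

// Stand-in for art::bit_cast: reinterpret the bits of one type as another.
template <typename Dst, typename Src>
Dst BitCast(Src src) {
  static_assert(sizeof(Dst) == sizeof(Src), "size mismatch");
  Dst dst;
  std::memcpy(&dst, &src, sizeof(dst));
  return dst;
}

int main() {
  float original = 1.5f;
  // On x86 the float comes back widened to a double; model that by storing
  // the double's bit pattern in the uint64_t result_f slot.
  uint64_t result_f = BitCast<uint64_t>(static_cast<double>(original));
  // The JNI end path narrows it back to float and returns the raw bits.
  double d = BitCast<double>(result_f);
  uint32_t bits = BitCast<uint32_t>(static_cast<float>(d));
  return BitCast<float>(bits) == original ? 0 : 1;
}
```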
diff --git a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
index 1e9e4fb..5d3ac73 100644
--- a/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
+++ b/runtime/entrypoints/quick/quick_trampoline_entrypoints.cc
@@ -1989,62 +1989,27 @@
                                 reinterpret_cast<uintptr_t>(nativeCode));
 }
 
+// Defined in quick_jni_entrypoints.cc.
+extern uint64_t GenericJniMethodEnd(Thread* self, uint32_t saved_local_ref_cookie,
+                                    jvalue result, uint64_t result_f, ArtMethod* called,
+                                    HandleScope* handle_scope);
 /*
  * Called after the native JNI code returns. Responsible for cleanup (handle scope, saved state)
  * and unlocking.
  */
-extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self, jvalue result, uint64_t result_f)
-    SHARED_REQUIRES(Locks::mutator_lock_) {
+extern "C" uint64_t artQuickGenericJniEndTrampoline(Thread* self,
+                                                    jvalue result,
+                                                    uint64_t result_f) {
+  // We're here just back from a native call. We don't hold the shared mutator lock at this point
+  // and won't until GoToRunnable() is called later in GenericJniMethodEnd(). Accessing objects or
+  // doing anything else that requires the mutator lock before then would cause problems, as the GC
+  // may hold the exclusive mutator lock and may be moving objects.
   ArtMethod** sp = self->GetManagedStack()->GetTopQuickFrame();
   uint32_t* sp32 = reinterpret_cast<uint32_t*>(sp);
   ArtMethod* called = *sp;
   uint32_t cookie = *(sp32 - 1);
-
-  jobject lock = nullptr;
-  if (called->IsSynchronized()) {
-    HandleScope* table = reinterpret_cast<HandleScope*>(reinterpret_cast<uint8_t*>(sp)
-        + sizeof(*sp));
-    lock = table->GetHandle(0).ToJObject();
-  }
-
-  char return_shorty_char = called->GetShorty()[0];
-
-  if (return_shorty_char == 'L') {
-    return artQuickGenericJniEndJNIRef(self, cookie, result.l, lock);
-  } else {
-    artQuickGenericJniEndJNINonRef(self, cookie, lock);
-
-    switch (return_shorty_char) {
-      case 'F': {
-        if (kRuntimeISA == kX86) {
-          // Convert back the result to float.
-          double d = bit_cast<double, uint64_t>(result_f);
-          return bit_cast<uint32_t, float>(static_cast<float>(d));
-        } else {
-          return result_f;
-        }
-      }
-      case 'D':
-        return result_f;
-      case 'Z':
-        return result.z;
-      case 'B':
-        return result.b;
-      case 'C':
-        return result.c;
-      case 'S':
-        return result.s;
-      case 'I':
-        return result.i;
-      case 'J':
-        return result.j;
-      case 'V':
-        return 0;
-      default:
-        LOG(FATAL) << "Unexpected return shorty character " << return_shorty_char;
-        return 0;
-    }
-  }
+  HandleScope* table = reinterpret_cast<HandleScope*>(reinterpret_cast<uint8_t*>(sp) + sizeof(*sp));
+  return GenericJniMethodEnd(self, cookie, result, result_f, called, table);
 }
 
 // We use TwoWordReturn to optimize scalar returns. We use the hi value for code, and the lo value
diff --git a/runtime/gc/allocator/rosalloc-inl.h b/runtime/gc/allocator/rosalloc-inl.h
index 25fdd7c..2510514 100644
--- a/runtime/gc/allocator/rosalloc-inl.h
+++ b/runtime/gc/allocator/rosalloc-inl.h
@@ -53,13 +53,7 @@
 }
 
 inline bool RosAlloc::Run::IsFull() {
-  const size_t num_vec = NumberOfBitmapVectors();
-  for (size_t v = 0; v < num_vec; ++v) {
-    if (~alloc_bit_map_[v] != 0) {
-      return false;
-    }
-  }
-  return true;
+  return free_list_.Size() == 0;
 }
 
 inline bool RosAlloc::CanAllocFromThreadLocalRun(Thread* self, size_t size) {
@@ -120,45 +114,14 @@
 }
 
 inline void* RosAlloc::Run::AllocSlot() {
-  const size_t idx = size_bracket_idx_;
-  while (true) {
-    if (kIsDebugBuild) {
-      // Make sure that no slots leaked, the bitmap should be full for all previous vectors.
-      for (size_t i = 0; i < first_search_vec_idx_; ++i) {
-        CHECK_EQ(~alloc_bit_map_[i], 0U);
-      }
-    }
-    uint32_t* const alloc_bitmap_ptr = &alloc_bit_map_[first_search_vec_idx_];
-    uint32_t ffz1 = __builtin_ffs(~*alloc_bitmap_ptr);
-    if (LIKELY(ffz1 != 0)) {
-      const uint32_t ffz = ffz1 - 1;
-      const uint32_t slot_idx = ffz +
-          first_search_vec_idx_ * sizeof(*alloc_bitmap_ptr) * kBitsPerByte;
-      const uint32_t mask = 1U << ffz;
-      DCHECK_LT(slot_idx, numOfSlots[idx]) << "out of range";
-      // Found an empty slot. Set the bit.
-      DCHECK_EQ(*alloc_bitmap_ptr & mask, 0U);
-      *alloc_bitmap_ptr |= mask;
-      DCHECK_NE(*alloc_bitmap_ptr & mask, 0U);
-      uint8_t* slot_addr = reinterpret_cast<uint8_t*>(this) +
-          headerSizes[idx] + slot_idx * bracketSizes[idx];
-      if (kTraceRosAlloc) {
-        LOG(INFO) << "RosAlloc::Run::AllocSlot() : 0x" << std::hex
-                  << reinterpret_cast<intptr_t>(slot_addr)
-                  << ", bracket_size=" << std::dec << bracketSizes[idx]
-                  << ", slot_idx=" << slot_idx;
-      }
-      return slot_addr;
-    }
-    const size_t num_words = RoundUp(numOfSlots[idx], 32) / 32;
-    if (first_search_vec_idx_ + 1 >= num_words) {
-      DCHECK(IsFull());
-      // Already at the last word, return null.
-      return nullptr;
-    }
-    // Increase the index to the next word and try again.
-    ++first_search_vec_idx_;
+  Slot* slot = free_list_.Remove();
+  if (kTraceRosAlloc && slot != nullptr) {
+    const uint8_t idx = size_bracket_idx_;
+    LOG(INFO) << "RosAlloc::Run::AllocSlot() : " << slot
+              << ", bracket_size=" << std::dec << bracketSizes[idx]
+              << ", slot_idx=" << SlotIndex(slot);
   }
+  return slot;
 }
 
 }  // namespace allocator
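[Editor's note: a minimal sketch of the new fast path above (simplified types, not the real Run): IsFull reduces to an emptiness test and AllocSlot to an O(1) pop from the head of the free list, replacing the old bitmap scan.]

```cpp
#include <cstddef>

struct Slot { Slot* next; };

// Simplified run: free slots are threaded into a singly-linked list through
// their first word; an empty list is exactly the "run is full" condition.
struct Run {
  Slot* free_head;
  bool IsFull() const { return free_head == nullptr; }
  void* AllocSlot() {          // O(1) pop, no bitmap search
    Slot* slot = free_head;
    if (slot != nullptr) {
      free_head = slot->next;
      slot->next = nullptr;    // clear the slot header before handing it out
    }
    return slot;               // nullptr when the run is full
  }
};

int main() {
  Slot a{nullptr};
  Slot b{&a};
  Run run{&b};                 // free list: b -> a
  bool ok = run.AllocSlot() == &b;
  ok = ok && run.AllocSlot() == &a;
  ok = ok && run.AllocSlot() == nullptr && run.IsFull();
  return ok ? 0 : 1;
}
```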
diff --git a/runtime/gc/allocator/rosalloc.cc b/runtime/gc/allocator/rosalloc.cc
index 470bc1c..9c8e4df 100644
--- a/runtime/gc/allocator/rosalloc.cc
+++ b/runtime/gc/allocator/rosalloc.cc
@@ -35,7 +35,7 @@
 namespace gc {
 namespace allocator {
 
-static constexpr bool kUsePrefetchDuringAllocRun = true;
+static constexpr bool kUsePrefetchDuringAllocRun = false;
 static constexpr bool kPrefetchNewRunDataByZeroing = false;
 static constexpr size_t kPrefetchStride = 64;
 
@@ -43,8 +43,6 @@
 size_t RosAlloc::numOfPages[kNumOfSizeBrackets];
 size_t RosAlloc::numOfSlots[kNumOfSizeBrackets];
 size_t RosAlloc::headerSizes[kNumOfSizeBrackets];
-size_t RosAlloc::bulkFreeBitMapOffsets[kNumOfSizeBrackets];
-size_t RosAlloc::threadLocalFreeBitMapOffsets[kNumOfSizeBrackets];
 bool RosAlloc::initialized_ = false;
 size_t RosAlloc::dedicated_full_run_storage_[kPageSize / sizeof(size_t)] = { 0 };
 RosAlloc::Run* RosAlloc::dedicated_full_run_ =
@@ -556,9 +554,7 @@
       new_run->magic_num_ = kMagicNum;
     }
     new_run->size_bracket_idx_ = idx;
-    new_run->SetAllocBitMapBitsForInvalidSlots();
     DCHECK(!new_run->IsThreadLocal());
-    DCHECK_EQ(new_run->first_search_vec_idx_, 0U);
     DCHECK(!new_run->to_be_bulk_freed_);
     if (kUsePrefetchDuringAllocRun && idx < kNumThreadLocalSizeBrackets) {
       // Take ownership of the cache lines if we are likely to be thread local run.
@@ -576,6 +572,7 @@
         }
       }
     }
+    new_run->InitFreeList();
   }
   return new_run;
 }
@@ -695,15 +692,11 @@
       MutexLock mu(self, *size_bracket_locks_[idx]);
       bool is_all_free_after_merge;
      // This is safe to do for the dedicated_full_run_ since the free lists are empty.
-      if (thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&is_all_free_after_merge)) {
+      if (thread_local_run->MergeThreadLocalFreeListToFreeList(&is_all_free_after_merge)) {
         DCHECK_NE(thread_local_run, dedicated_full_run_);
         // Some slot got freed. Keep it.
         DCHECK(!thread_local_run->IsFull());
         DCHECK_EQ(is_all_free_after_merge, thread_local_run->IsAllFree());
-        if (is_all_free_after_merge) {
-          // Check that the bitmap idx is back at 0 if it's all free.
-          DCHECK_EQ(thread_local_run->first_search_vec_idx_, 0U);
-        }
       } else {
         // No slots got freed. Try to refill the thread-local run.
         DCHECK(thread_local_run->IsFull());
@@ -792,7 +785,7 @@
     DCHECK_LT(run->size_bracket_idx_, kNumThreadLocalSizeBrackets);
     DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
     DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
-    run->MarkThreadLocalFreeBitMap(ptr);
+    run->AddToThreadLocalFreeList(ptr);
     if (kTraceRosAlloc) {
       LOG(INFO) << "RosAlloc::FreeFromRun() : Freed a slot in a thread local run 0x" << std::hex
                 << reinterpret_cast<intptr_t>(run);
@@ -818,7 +811,7 @@
     }
     DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
     DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
-    run->ZeroHeader();
+    run->ZeroHeaderAndSlotHeaders();
     {
       MutexLock lock_mu(self, lock_);
       FreePages(self, run, true);
@@ -853,271 +846,145 @@
   return bracket_size;
 }
 
-std::string RosAlloc::Run::BitMapToStr(uint32_t* bit_map_base, size_t num_vec) {
-  std::string bit_map_str;
-  for (size_t v = 0; v < num_vec; v++) {
-    uint32_t vec = bit_map_base[v];
-    if (v != num_vec - 1) {
-      bit_map_str.append(StringPrintf("%x-", vec));
+template<bool kUseTail>
+std::string RosAlloc::Run::FreeListToStr(SlotFreeList<kUseTail>* free_list) {
+  std::string free_list_str;
+  const uint8_t idx = size_bracket_idx_;
+  const size_t bracket_size = bracketSizes[idx];
+  for (Slot* slot = free_list->Head(); slot != nullptr; slot = slot->Next()) {
+    bool is_last = slot->Next() == nullptr;
+    uintptr_t slot_offset = reinterpret_cast<uintptr_t>(slot) -
+        reinterpret_cast<uintptr_t>(FirstSlot());
+    DCHECK_EQ(slot_offset % bracket_size, 0U);
+    uintptr_t slot_idx = slot_offset / bracket_size;
+    if (!is_last) {
+      free_list_str.append(StringPrintf("%u-", static_cast<uint32_t>(slot_idx)));
     } else {
-      bit_map_str.append(StringPrintf("%x", vec));
+      free_list_str.append(StringPrintf("%u", static_cast<uint32_t>(slot_idx)));
     }
   }
-  return bit_map_str.c_str();
+  return free_list_str;
 }
 
 std::string RosAlloc::Run::Dump() {
   size_t idx = size_bracket_idx_;
-  size_t num_slots = numOfSlots[idx];
-  size_t num_vec = RoundUp(num_slots, 32) / 32;
   std::ostringstream stream;
   stream << "RosAlloc Run = " << reinterpret_cast<void*>(this)
          << "{ magic_num=" << static_cast<int>(magic_num_)
          << " size_bracket_idx=" << idx
          << " is_thread_local=" << static_cast<int>(is_thread_local_)
          << " to_be_bulk_freed=" << static_cast<int>(to_be_bulk_freed_)
-         << " first_search_vec_idx=" << first_search_vec_idx_
-         << " alloc_bit_map=" << BitMapToStr(alloc_bit_map_, num_vec)
-         << " bulk_free_bit_map=" << BitMapToStr(BulkFreeBitMap(), num_vec)
-         << " thread_local_bit_map=" << BitMapToStr(ThreadLocalFreeBitMap(), num_vec)
+         << " free_list=" << FreeListToStr(&free_list_)
+         << " bulk_free_list=" << FreeListToStr(&bulk_free_list_)
+         << " thread_local_list=" << FreeListToStr(&thread_local_free_list_)
          << " }" << std::endl;
   return stream.str();
 }
 
+inline size_t RosAlloc::Run::SlotIndex(Slot* slot) {
+  const uint8_t idx = size_bracket_idx_;
+  const size_t bracket_size = bracketSizes[idx];
+  const size_t offset_from_slot_base = reinterpret_cast<uint8_t*>(slot)
+      - reinterpret_cast<uint8_t*>(FirstSlot());
+  DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast<size_t>(0));
+  size_t slot_idx = offset_from_slot_base / bracket_size;
+  DCHECK_LT(slot_idx, numOfSlots[idx]);
+  return slot_idx;
+}
+
 void RosAlloc::Run::FreeSlot(void* ptr) {
   DCHECK(!IsThreadLocal());
   const uint8_t idx = size_bracket_idx_;
   const size_t bracket_size = bracketSizes[idx];
-  const size_t offset_from_slot_base = reinterpret_cast<uint8_t*>(ptr)
-      - (reinterpret_cast<uint8_t*>(this) + headerSizes[idx]);
-  DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast<size_t>(0));
-  size_t slot_idx = offset_from_slot_base / bracket_size;
-  DCHECK_LT(slot_idx, numOfSlots[idx]);
-  size_t vec_idx = slot_idx / 32;
-  if (kIsDebugBuild) {
-    size_t num_vec = RoundUp(numOfSlots[idx], 32) / 32;
-    DCHECK_LT(vec_idx, num_vec);
-  }
-  size_t vec_off = slot_idx % 32;
-  uint32_t* vec = &alloc_bit_map_[vec_idx];
-  first_search_vec_idx_ = std::min(first_search_vec_idx_, static_cast<uint32_t>(vec_idx));
-  const uint32_t mask = 1U << vec_off;
-  DCHECK_NE(*vec & mask, 0U);
-  *vec &= ~mask;
-  DCHECK_EQ(*vec & mask, 0U);
+  Slot* slot = ToSlot(ptr);
   // Zero out the memory.
   // TODO: Investigate alternate memset since ptr is guaranteed to be aligned to 16.
-  memset(ptr, 0, bracket_size);
+  memset(slot, 0, bracket_size);
+  free_list_.Add(slot);
   if (kTraceRosAlloc) {
-    LOG(INFO) << "RosAlloc::Run::FreeSlot() : 0x" << std::hex << reinterpret_cast<intptr_t>(ptr)
-              << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx;
+    LOG(INFO) << "RosAlloc::Run::FreeSlot() : " << slot
+              << ", bracket_size=" << std::dec << bracket_size << ", slot_idx=" << SlotIndex(slot);
   }
 }
 
-size_t RosAlloc::Run::NumberOfFreeSlots() {
-  size_t num_alloc_slots = 0;
-  const size_t idx = size_bracket_idx_;
-  const size_t num_slots = numOfSlots[idx];
-  const size_t num_vec = RoundUp(num_slots, 32) / 32;
-  DCHECK_NE(num_vec, 0U);
-  for (size_t v = 0; v < num_vec - 1; v++) {
-    num_alloc_slots += POPCOUNT(alloc_bit_map_[v]);
-  }
-  // Don't count the invalid bits in the last vector.
-  uint32_t last_vec_masked = alloc_bit_map_[num_vec - 1] &
-      ~GetBitmapLastVectorMask(num_slots, num_vec);
-  num_alloc_slots += POPCOUNT(last_vec_masked);
-  size_t num_free_slots = num_slots - num_alloc_slots;
-  DCHECK_LE(num_alloc_slots, num_slots);
-  DCHECK_LE(num_free_slots, num_slots);
-  return num_free_slots;
-}
-
-inline bool RosAlloc::Run::MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out) {
+inline bool RosAlloc::Run::MergeThreadLocalFreeListToFreeList(bool* is_all_free_after_out) {
   DCHECK(IsThreadLocal());
-  // Free slots in the alloc bit map based on the thread local free bit map.
-  const size_t idx = size_bracket_idx_;
-  const size_t num_of_slots = numOfSlots[idx];
-  const size_t num_vec = RoundUp(num_of_slots, 32) / 32;
-  bool changed = false;
-  uint32_t* vecp = &alloc_bit_map_[0];
-  uint32_t* tl_free_vecp = &ThreadLocalFreeBitMap()[0];
-  bool is_all_free_after = true;
-  for (size_t v = 0; v < num_vec; v++, vecp++, tl_free_vecp++) {
-    uint32_t tl_free_vec = *tl_free_vecp;
-    uint32_t vec_before = *vecp;
-    uint32_t vec_after;
-    if (tl_free_vec != 0) {
-      first_search_vec_idx_ = std::min(first_search_vec_idx_, static_cast<uint32_t>(v));
-      vec_after = vec_before & ~tl_free_vec;
-      *vecp = vec_after;
-      changed = true;
-      *tl_free_vecp = 0;  // clear the thread local free bit map.
-    } else {
-      vec_after = vec_before;
-    }
-    if (vec_after != 0) {
-      if (v == num_vec - 1) {
-        // Only not all free if a bit other than the mask bits are set.
-        is_all_free_after =
-            is_all_free_after && GetBitmapLastVectorMask(num_of_slots, num_vec) == vec_after;
-      } else {
-        is_all_free_after = false;
-      }
-    }
-    DCHECK_EQ(*tl_free_vecp, static_cast<uint32_t>(0));
-  }
-  *is_all_free_after_out = is_all_free_after;
-  // Return true if there was at least a bit set in the thread-local
-  // free bit map and at least a bit in the alloc bit map changed.
-  return changed;
-}
-
-inline void RosAlloc::Run::MergeBulkFreeBitMapIntoAllocBitMap() {
-  DCHECK(!IsThreadLocal());
-  // Free slots in the alloc bit map based on the bulk free bit map.
-  const size_t num_vec = NumberOfBitmapVectors();
-  uint32_t* vecp = &alloc_bit_map_[0];
-  uint32_t* free_vecp = &BulkFreeBitMap()[0];
-  for (size_t v = 0; v < num_vec; v++, vecp++, free_vecp++) {
-    uint32_t free_vec = *free_vecp;
-    if (free_vec != 0) {
-      first_search_vec_idx_ = std::min(first_search_vec_idx_, static_cast<uint32_t>(v));
-      *vecp &= ~free_vec;
-      *free_vecp = 0;  // clear the bulk free bit map.
-    }
-    DCHECK_EQ(*free_vecp, static_cast<uint32_t>(0));
-  }
-}
-
-inline void RosAlloc::Run::UnionBulkFreeBitMapToThreadLocalFreeBitMap() {
-  DCHECK(IsThreadLocal());
-  // Union the thread local bit map with the bulk free bit map.
-  size_t num_vec = NumberOfBitmapVectors();
-  uint32_t* to_vecp = &ThreadLocalFreeBitMap()[0];
-  uint32_t* from_vecp = &BulkFreeBitMap()[0];
-  for (size_t v = 0; v < num_vec; v++, to_vecp++, from_vecp++) {
-    uint32_t from_vec = *from_vecp;
-    if (from_vec != 0) {
-      *to_vecp |= from_vec;
-      *from_vecp = 0;  // clear the bulk free bit map.
-    }
-    DCHECK_EQ(*from_vecp, static_cast<uint32_t>(0));
-  }
-}
-
-inline void RosAlloc::Run::MarkThreadLocalFreeBitMap(void* ptr) {
-  DCHECK(IsThreadLocal());
-  MarkFreeBitMapShared(ptr, ThreadLocalFreeBitMap(), "MarkThreadLocalFreeBitMap");
-}
-
-inline size_t RosAlloc::Run::MarkBulkFreeBitMap(void* ptr) {
-  return MarkFreeBitMapShared(ptr, BulkFreeBitMap(), "MarkFreeBitMap");
-}
-
-inline size_t RosAlloc::Run::MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base,
-                                                  const char* caller_name) {
+  // Merge the thread local free list into the free list and clear the thread local free list.
   const uint8_t idx = size_bracket_idx_;
-  const size_t offset_from_slot_base = reinterpret_cast<uint8_t*>(ptr)
-      - (reinterpret_cast<uint8_t*>(this) + headerSizes[idx]);
+  const size_t thread_local_free_list_size = thread_local_free_list_.Size();
+  const size_t size_before = free_list_.Size();
+  free_list_.Merge(&thread_local_free_list_);
+  const size_t size_after = free_list_.Size();
+  DCHECK_EQ(size_before < size_after, thread_local_free_list_size > 0);
+  DCHECK_LE(size_before, size_after);
+  *is_all_free_after_out = free_list_.Size() == numOfSlots[idx];
+  // Return true if at least one slot was added to the free list.
+  return size_before < size_after;
+}
+
+inline void RosAlloc::Run::MergeBulkFreeListToFreeList() {
+  DCHECK(!IsThreadLocal());
+  // Merge the bulk free list into the free list and clear the bulk free list.
+  free_list_.Merge(&bulk_free_list_);
+}
+
+inline void RosAlloc::Run::MergeBulkFreeListToThreadLocalFreeList() {
+  DCHECK(IsThreadLocal());
+  // Merge the bulk free list into the thread local free list and clear the bulk free list.
+  thread_local_free_list_.Merge(&bulk_free_list_);
+}
+
+inline void RosAlloc::Run::AddToThreadLocalFreeList(void* ptr) {
+  DCHECK(IsThreadLocal());
+  AddToFreeListShared(ptr, &thread_local_free_list_, __FUNCTION__);
+}
+
+inline size_t RosAlloc::Run::AddToBulkFreeList(void* ptr) {
+  return AddToFreeListShared(ptr, &bulk_free_list_, __FUNCTION__);
+}
+
+inline size_t RosAlloc::Run::AddToFreeListShared(void* ptr,
+                                                 SlotFreeList<true>* free_list,
+                                                 const char* caller_name) {
+  const uint8_t idx = size_bracket_idx_;
   const size_t bracket_size = bracketSizes[idx];
-  memset(ptr, 0, bracket_size);
-  DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast<size_t>(0));
-  size_t slot_idx = offset_from_slot_base / bracket_size;
-  DCHECK_LT(slot_idx, numOfSlots[idx]);
-  size_t vec_idx = slot_idx / 32;
-  if (kIsDebugBuild) {
-    size_t num_vec = NumberOfBitmapVectors();
-    DCHECK_LT(vec_idx, num_vec);
-  }
-  size_t vec_off = slot_idx % 32;
-  uint32_t* vec = &free_bit_map_base[vec_idx];
-  const uint32_t mask = 1U << vec_off;
-  DCHECK_EQ(*vec & mask, 0U);
-  *vec |= mask;
-  DCHECK_NE(*vec & mask, 0U);
+  Slot* slot = ToSlot(ptr);
+  memset(slot, 0, bracket_size);
+  free_list->Add(slot);
   if (kTraceRosAlloc) {
-    LOG(INFO) << "RosAlloc::Run::" << caller_name << "() : 0x" << std::hex
-              << reinterpret_cast<intptr_t>(ptr)
-              << ", bracket_size=" << std::dec << bracketSizes[idx] << ", slot_idx=" << slot_idx;
+    LOG(INFO) << "RosAlloc::Run::" << caller_name << "() : " << ptr
+              << ", bracket_size=" << std::dec << bracket_size << ", slot_idx=" << SlotIndex(slot);
   }
   return bracket_size;
 }
 
-inline uint32_t RosAlloc::Run::GetBitmapLastVectorMask(size_t num_slots, size_t num_vec) {
-  const size_t kBitsPerVec = 32;
-  DCHECK_GE(num_vec * kBitsPerVec, num_slots);
-  DCHECK_NE(num_vec, 0U);
-  size_t remain = num_vec * kBitsPerVec - num_slots;
-  DCHECK_LT(remain, kBitsPerVec);
-  return ((1U << remain) - 1) << ((kBitsPerVec - remain) & 0x1F);
-}
-
-inline bool RosAlloc::Run::IsAllFree() {
+inline void RosAlloc::Run::ZeroHeaderAndSlotHeaders() {
+  DCHECK(IsAllFree());
   const uint8_t idx = size_bracket_idx_;
-  const size_t num_slots = numOfSlots[idx];
-  const size_t num_vec = NumberOfBitmapVectors();
-  DCHECK_NE(num_vec, 0U);
-  // Check the last vector after the loop since it uses a special case for the masked bits.
-  for (size_t v = 0; v < num_vec - 1; v++) {
-    uint32_t vec = alloc_bit_map_[v];
-    if (vec != 0) {
-      return false;
-    }
+  // Zero the slot header (next pointers).
+  for (Slot* slot = free_list_.Head(); slot != nullptr; ) {
+    Slot* next_slot = slot->Next();
+    slot->Clear();
+    slot = next_slot;
   }
-  // Make sure the last word is equal to the mask, all other bits must be 0.
-  return alloc_bit_map_[num_vec - 1] == GetBitmapLastVectorMask(num_slots, num_vec);
-}
-
-inline bool RosAlloc::Run::IsBulkFreeBitmapClean() {
-  const size_t num_vec = NumberOfBitmapVectors();
-  for (size_t v = 0; v < num_vec; v++) {
-    uint32_t vec = BulkFreeBitMap()[v];
-    if (vec != 0) {
-      return false;
-    }
-  }
-  return true;
-}
-
-inline bool RosAlloc::Run::IsThreadLocalFreeBitmapClean() {
-  const size_t num_vec = NumberOfBitmapVectors();
-  for (size_t v = 0; v < num_vec; v++) {
-    uint32_t vec = ThreadLocalFreeBitMap()[v];
-    if (vec != 0) {
-      return false;
-    }
-  }
-  return true;
-}
-
-inline void RosAlloc::Run::SetAllocBitMapBitsForInvalidSlots() {
-  const size_t idx = size_bracket_idx_;
-  const size_t num_slots = numOfSlots[idx];
-  const size_t num_vec = RoundUp(num_slots, 32) / 32;
-  DCHECK_NE(num_vec, 0U);
-  // Make sure to set the bits at the end of the bitmap so that we don't allocate there since they
-  // don't represent valid slots.
-  alloc_bit_map_[num_vec - 1] |= GetBitmapLastVectorMask(num_slots, num_vec);
-}
-
-inline void RosAlloc::Run::ZeroHeader() {
-  const uint8_t idx = size_bracket_idx_;
+  // Zero the header.
   memset(this, 0, headerSizes[idx]);
+  // Check that the entire run is all zero.
+  if (kIsDebugBuild) {
+    const size_t size = numOfPages[idx] * kPageSize;
+    const uintptr_t* word_ptr = reinterpret_cast<uintptr_t*>(this);
+    for (size_t i = 0; i < size / sizeof(uintptr_t); ++i) {
+      CHECK_EQ(word_ptr[i], 0U) << "words don't match at index " << i;
+    }
+  }
 }
 
 inline void RosAlloc::Run::ZeroData() {
   const uint8_t idx = size_bracket_idx_;
-  uint8_t* slot_begin = reinterpret_cast<uint8_t*>(this) + headerSizes[idx];
+  uint8_t* slot_begin = reinterpret_cast<uint8_t*>(FirstSlot());
   memset(slot_begin, 0, numOfSlots[idx] * bracketSizes[idx]);
 }
 
-inline void RosAlloc::Run::FillAllocBitMap() {
-  size_t num_vec = NumberOfBitmapVectors();
-  memset(alloc_bit_map_, 0xFF, sizeof(uint32_t) * num_vec);
-  first_search_vec_idx_ = num_vec - 1;  // No free bits in any of the bitmap words.
-}
-
 void RosAlloc::Run::InspectAllSlots(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg),
                                     void* arg) {
   size_t idx = size_bracket_idx_;
@@ -1126,26 +993,27 @@
   size_t bracket_size = IndexToBracketSize(idx);
   DCHECK_EQ(slot_base + num_slots * bracket_size,
             reinterpret_cast<uint8_t*>(this) + numOfPages[idx] * kPageSize);
-  size_t num_vec = RoundUp(num_slots, 32) / 32;
-  size_t slots = 0;
-  const uint32_t* const tl_free_vecp = IsThreadLocal() ? ThreadLocalFreeBitMap() : nullptr;
-  for (size_t v = 0; v < num_vec; v++, slots += 32) {
-    DCHECK_GE(num_slots, slots);
-    uint32_t vec = alloc_bit_map_[v];
-    if (tl_free_vecp != nullptr) {
-      // Clear out the set bits in the thread local free bitmap since these aren't actually
-      // allocated.
-      vec &= ~tl_free_vecp[v];
+  // Free slots are on the free list and the allocated/used slots are not. We traverse the free list
+  // to find out and record which slots are free in the is_free array.
+  std::unique_ptr<bool[]> is_free(new bool[num_slots]());  // zero initialized
+  for (Slot* slot = free_list_.Head(); slot != nullptr; slot = slot->Next()) {
+    size_t slot_idx = SlotIndex(slot);
+    DCHECK_LT(slot_idx, num_slots);
+    is_free[slot_idx] = true;
+  }
+  if (IsThreadLocal()) {
+    for (Slot* slot = thread_local_free_list_.Head(); slot != nullptr; slot = slot->Next()) {
+      size_t slot_idx = SlotIndex(slot);
+      DCHECK_LT(slot_idx, num_slots);
+      is_free[slot_idx] = true;
     }
-    size_t end = std::min(num_slots - slots, static_cast<size_t>(32));
-    for (size_t i = 0; i < end; ++i) {
-      bool is_allocated = ((vec >> i) & 0x1) != 0;
-      uint8_t* slot_addr = slot_base + (slots + i) * bracket_size;
-      if (is_allocated) {
-        handler(slot_addr, slot_addr + bracket_size, bracket_size, arg);
-      } else {
-        handler(slot_addr, slot_addr + bracket_size, 0, arg);
-      }
+  }
+  for (size_t slot_idx = 0; slot_idx < num_slots; ++slot_idx) {
+    uint8_t* slot_addr = slot_base + slot_idx * bracket_size;
+    if (!is_free[slot_idx]) {
+      handler(slot_addr, slot_addr + bracket_size, bracket_size, arg);
+    } else {
+      handler(slot_addr, slot_addr + bracket_size, 0, arg);
     }
   }
 }
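[Editor's note: the rewritten InspectAllSlots above derives per-slot liveness from the free lists instead of a bitmap. A compact sketch of that marking pass over a fabricated four-slot run (simplified layout, not ART code):]

```cpp
#include <cstddef>

struct Slot { Slot* next; };

int main() {
  // Hypothetical run: four fixed-size slots, with slots 1 and 3 free and
  // threaded into a singly-linked free list through their first word.
  constexpr size_t kNumSlots = 4;
  constexpr size_t kBracketSize = sizeof(Slot);
  alignas(Slot) char run[kNumSlots * kBracketSize] = {};
  char* slot_base = run;

  Slot* s1 = reinterpret_cast<Slot*>(slot_base + 1 * kBracketSize);
  Slot* s3 = reinterpret_cast<Slot*>(slot_base + 3 * kBracketSize);
  s3->next = nullptr;
  s1->next = s3;
  Slot* free_head = s1;

  // One traversal of the free list marks the free slots; everything left
  // unmarked must be allocated (slots 0 and 2 here).
  bool is_free[kNumSlots] = {};
  for (Slot* slot = free_head; slot != nullptr; slot = slot->next) {
    size_t idx = (reinterpret_cast<char*>(slot) - slot_base) / kBracketSize;
    is_free[idx] = true;
  }
  bool ok = !is_free[0] && is_free[1] && !is_free[2] && is_free[3];
  return ok ? 0 : 1;
}
```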
@@ -1236,7 +1104,7 @@
     DCHECK(run != nullptr);
     DCHECK_EQ(run->magic_num_, kMagicNum);
     // Add the slot to the bulk free list.
-    freed_bytes += run->MarkBulkFreeBitMap(ptr);
+    freed_bytes += run->AddToBulkFreeList(ptr);
 #ifdef __ANDROID__
     if (!run->to_be_bulk_freed_) {
       run->to_be_bulk_freed_ = true;
@@ -1262,7 +1130,7 @@
       DCHECK_LT(run->size_bracket_idx_, kNumThreadLocalSizeBrackets);
       DCHECK(non_full_runs_[idx].find(run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(run) == full_runs_[idx].end());
-      run->UnionBulkFreeBitMapToThreadLocalFreeBitMap();
+      run->MergeBulkFreeListToThreadLocalFreeList();
       if (kTraceRosAlloc) {
         LOG(INFO) << "RosAlloc::BulkFree() : Freed slot(s) in a thread local run 0x"
                   << std::hex << reinterpret_cast<intptr_t>(run);
@@ -1272,7 +1140,7 @@
       // it's become all free.
     } else {
       bool run_was_full = run->IsFull();
-      run->MergeBulkFreeBitMapIntoAllocBitMap();
+      run->MergeBulkFreeListToFreeList();
       if (kTraceRosAlloc) {
         LOG(INFO) << "RosAlloc::BulkFree() : Freed slot(s) in a run 0x" << std::hex
                   << reinterpret_cast<intptr_t>(run);
@@ -1316,7 +1184,7 @@
           DCHECK(non_full_runs->find(run) == non_full_runs->end());
         }
         if (!run_was_current) {
-          run->ZeroHeader();
+          run->ZeroHeaderAndSlotHeaders();
           MutexLock lock_mu(self, lock_);
           FreePages(self, run, true);
         }
@@ -1677,9 +1545,9 @@
       size_t num_free_slots = thread_local_run->NumberOfFreeSlots();
       free_bytes += num_free_slots * bracketSizes[idx];
       bool dont_care;
-      thread_local_run->MergeThreadLocalFreeBitMapToAllocBitMap(&dont_care);
+      thread_local_run->MergeThreadLocalFreeListToFreeList(&dont_care);
       thread_local_run->SetIsThreadLocal(false);
-      thread_local_run->MergeBulkFreeBitMapIntoAllocBitMap();
+      thread_local_run->MergeBulkFreeListToFreeList();
       DCHECK(non_full_runs_[idx].find(thread_local_run) == non_full_runs_[idx].end());
       DCHECK(full_runs_[idx].find(thread_local_run) == full_runs_[idx].end());
       RevokeRun(self, idx, thread_local_run);
@@ -1702,7 +1570,7 @@
       }
     }
   } else if (run->IsAllFree()) {
-    run->ZeroHeader();
+    run->ZeroHeaderAndSlotHeaders();
     MutexLock mu(self, lock_);
     FreePages(self, run, true);
   } else {
@@ -1814,22 +1682,15 @@
     size_t max_num_of_slots = run_size / bracket_size;
     // Compute the actual number of slots by taking the header and
     // alignment into account.
-    size_t fixed_header_size = RoundUp(Run::fixed_header_size(), sizeof(uint32_t));
-    DCHECK_EQ(fixed_header_size, static_cast<size_t>(8));
+    size_t fixed_header_size = RoundUp(Run::fixed_header_size(), sizeof(uint64_t));
+    DCHECK_EQ(fixed_header_size, 80U);
     size_t header_size = 0;
-    size_t bulk_free_bit_map_offset = 0;
-    size_t thread_local_free_bit_map_offset = 0;
     size_t num_of_slots = 0;
     // Search for the maximum number of slots that allows enough space
-    // for the header (including the bit maps.)
+    // for the header.
     for (int s = max_num_of_slots; s >= 0; s--) {
       size_t tmp_slots_size = bracket_size * s;
-      size_t tmp_bit_map_size = RoundUp(s, sizeof(uint32_t) * kBitsPerByte) / kBitsPerByte;
-      size_t tmp_bulk_free_bit_map_size = tmp_bit_map_size;
-      size_t tmp_bulk_free_bit_map_off = fixed_header_size + tmp_bit_map_size;
-      size_t tmp_thread_local_free_bit_map_size = tmp_bit_map_size;
-      size_t tmp_thread_local_free_bit_map_off = tmp_bulk_free_bit_map_off + tmp_bulk_free_bit_map_size;
-      size_t tmp_unaligned_header_size = tmp_thread_local_free_bit_map_off + tmp_thread_local_free_bit_map_size;
+      size_t tmp_unaligned_header_size = fixed_header_size;
       // Align up the unaligned header size. bracket_size may not be a power of two.
       size_t tmp_header_size = (tmp_unaligned_header_size % bracket_size == 0) ?
           tmp_unaligned_header_size :
@@ -1841,24 +1702,19 @@
         // space for the header.
         num_of_slots = s;
         header_size = tmp_header_size;
-        bulk_free_bit_map_offset = tmp_bulk_free_bit_map_off;
-        thread_local_free_bit_map_offset = tmp_thread_local_free_bit_map_off;
         break;
       }
     }
-    DCHECK(num_of_slots > 0 && header_size > 0 && bulk_free_bit_map_offset > 0);
+    DCHECK_GT(num_of_slots, 0U);
+    DCHECK_GT(header_size, 0U);
     // Add the padding for the alignment remainder.
     header_size += run_size % bracket_size;
     DCHECK_EQ(header_size + num_of_slots * bracket_size, run_size);
     numOfSlots[i] = num_of_slots;
     headerSizes[i] = header_size;
-    bulkFreeBitMapOffsets[i] = bulk_free_bit_map_offset;
-    threadLocalFreeBitMapOffsets[i] = thread_local_free_bit_map_offset;
     if (kTraceRosAlloc) {
       LOG(INFO) << "numOfSlots[" << i << "]=" << numOfSlots[i]
-                << ", headerSizes[" << i << "]=" << headerSizes[i]
-                << ", bulkFreeBitMapOffsets[" << i << "]=" << bulkFreeBitMapOffsets[i]
-                << ", threadLocalFreeBitMapOffsets[" << i << "]=" << threadLocalFreeBitMapOffsets[i];;
+                << ", headerSizes[" << i << "]=" << headerSizes[i];
     }
   }
  // Leave the free list empty so nobody can successfully allocate from it.
@@ -1868,8 +1724,11 @@
   // It doesn't matter which size bracket we use since the main goal is to have the allocation
   // fail 100% of the time you attempt to allocate into the dedicated full run.
   dedicated_full_run_->size_bracket_idx_ = 0;
-  dedicated_full_run_->FillAllocBitMap();
+  DCHECK_EQ(dedicated_full_run_->FreeList()->Size(), 0U);  // It looks full.
   dedicated_full_run_->SetIsThreadLocal(true);
+
+  // The smallest bracket size must be at least as large as sizeof(Slot).
+  DCHECK_LE(sizeof(Slot), bracketSizes[0]) << "sizeof(Slot) <= the smallest bracket size";
 }
 
 void RosAlloc::BytesAllocatedCallback(void* start ATTRIBUTE_UNUSED, void* end ATTRIBUTE_UNUSED,
@@ -2025,19 +1884,12 @@
   CHECK_LT(idx, kNumOfSizeBrackets) << "Out of range size bracket index : " << Dump();
   uint8_t* slot_base = reinterpret_cast<uint8_t*>(this) + headerSizes[idx];
   const size_t num_slots = numOfSlots[idx];
-  const size_t num_vec = RoundUp(num_slots, 32) / 32;
-  CHECK_GT(num_vec, 0U);
   size_t bracket_size = IndexToBracketSize(idx);
   CHECK_EQ(slot_base + num_slots * bracket_size,
            reinterpret_cast<uint8_t*>(this) + numOfPages[idx] * kPageSize)
       << "Mismatch in the end address of the run " << Dump();
-  // Check that the bulk free bitmap is clean. It's only used during BulkFree().
-  CHECK(IsBulkFreeBitmapClean()) << "The bulk free bit map isn't clean " << Dump();
-  uint32_t last_word_mask = GetBitmapLastVectorMask(num_slots, num_vec);
-  // Make sure all the bits at the end of the run are set so that we don't allocate there.
-  CHECK_EQ(alloc_bit_map_[num_vec - 1] & last_word_mask, last_word_mask);
-  // Ensure that the first bitmap index is valid.
-  CHECK_LT(first_search_vec_idx_, num_vec);
+  // Check that the bulk free list is empty. It's only used during BulkFree().
+  // Check that the bulk free list is empty. It's only used during BulkFree().
+  CHECK(IsBulkFreeListEmpty()) << "The bulk free list isn't empty " << Dump();
   // Check the thread local runs, the current runs, and the run sets.
   if (IsThreadLocal()) {
     // If it's a thread local run, then it must be pointed to by an owner thread.
@@ -2059,11 +1911,11 @@
     }
     CHECK(owner_found) << "A thread local run has no owner thread " << Dump();
   } else {
-    // If it's not thread local, check that the thread local free bitmap is clean.
-    CHECK(IsThreadLocalFreeBitmapClean())
-        << "A non-thread-local run's thread local free bitmap isn't clean "
+    // If it's not thread local, check that the thread local free list is empty.
+    CHECK(IsThreadLocalFreeListEmpty())
+        << "A non-thread-local run's thread local free list isn't empty "
         << Dump();
-    // Check if it's a current run for the size bucket.
+    // Check if it's a current run for the size bracket.
     bool is_current_run = false;
     for (size_t i = 0; i < kNumOfSizeBrackets; i++) {
       MutexLock mu(self, *rosalloc->size_bracket_locks_[i]);
@@ -2101,34 +1953,39 @@
     }
   }
   // Check each slot.
-  size_t slots = 0;
   size_t memory_tool_modifier = running_on_memory_tool ?
       2 * ::art::gc::space::kDefaultMemoryToolRedZoneBytes :
       0U;
-  for (size_t v = 0; v < num_vec; v++, slots += 32) {
-    DCHECK_GE(num_slots, slots) << "Out of bounds";
-    uint32_t vec = alloc_bit_map_[v];
-    uint32_t thread_local_free_vec = ThreadLocalFreeBitMap()[v];
-    size_t end = std::min(num_slots - slots, static_cast<size_t>(32));
-    for (size_t i = 0; i < end; ++i) {
-      bool is_allocated = ((vec >> i) & 0x1) != 0;
-      // If a thread local run, slots may be marked freed in the
-      // thread local free bitmap.
-      bool is_thread_local_freed = IsThreadLocal() && ((thread_local_free_vec >> i) & 0x1) != 0;
-      if (is_allocated && !is_thread_local_freed) {
-        uint8_t* slot_addr = slot_base + (slots + i) * bracket_size;
-        if (running_on_memory_tool) {
-          slot_addr += ::art::gc::space::kDefaultMemoryToolRedZoneBytes;
-        }
-        mirror::Object* obj = reinterpret_cast<mirror::Object*>(slot_addr);
-        size_t obj_size = obj->SizeOf();
-        CHECK_LE(obj_size + memory_tool_modifier, kLargeSizeThreshold)
-            << "A run slot contains a large object " << Dump();
-        CHECK_EQ(SizeToIndex(obj_size + memory_tool_modifier), idx)
-            << PrettyTypeOf(obj) << " "
-            << "obj_size=" << obj_size << "(" << obj_size + memory_tool_modifier << "), idx=" << idx
-            << " A run slot contains an object with wrong size " << Dump();
-      }
+  // TODO: reuse InspectAllSlots().
+  std::unique_ptr<bool[]> is_free(new bool[num_slots]());  // zero initialized
+  // Mark the free slots and the remaining ones are allocated.
+  for (Slot* slot = free_list_.Head(); slot != nullptr; slot = slot->Next()) {
+    size_t slot_idx = SlotIndex(slot);
+    DCHECK_LT(slot_idx, num_slots);
+    is_free[slot_idx] = true;
+  }
+  if (IsThreadLocal()) {
+    for (Slot* slot = thread_local_free_list_.Head(); slot != nullptr; slot = slot->Next()) {
+      size_t slot_idx = SlotIndex(slot);
+      DCHECK_LT(slot_idx, num_slots);
+      is_free[slot_idx] = true;
+    }
+  }
+  for (size_t slot_idx = 0; slot_idx < num_slots; ++slot_idx) {
+    uint8_t* slot_addr = slot_base + slot_idx * bracket_size;
+    if (running_on_memory_tool) {
+      slot_addr += ::art::gc::space::kDefaultMemoryToolRedZoneBytes;
+    }
+    if (!is_free[slot_idx]) {
+      // The slot is allocated.
+      mirror::Object* obj = reinterpret_cast<mirror::Object*>(slot_addr);
+      size_t obj_size = obj->SizeOf();
+      CHECK_LE(obj_size + memory_tool_modifier, kLargeSizeThreshold)
+          << "A run slot contains a large object " << Dump();
+      CHECK_EQ(SizeToIndex(obj_size + memory_tool_modifier), idx)
+          << PrettyTypeOf(obj) << " "
+          << "obj_size=" << obj_size << "(" << obj_size + memory_tool_modifier << "), idx=" << idx
+          << " A run slot contains an object with wrong size " << Dump();
     }
   }
 }
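[Editor's note: the Merge-based replacements in this file lean on the incoming list carrying a tail pointer. A small sketch (simplified, not ART's SlotFreeList) of the O(1) splice that MergeThreadLocalFreeListToFreeList and friends perform: the merged-in list is prepended by pointing its tail at the current head, with no traversal of either list.]

```cpp
#include <cstddef>

struct Slot { Slot* next = nullptr; };

struct FreeList {
  Slot* head = nullptr;
  Slot* tail = nullptr;
  size_t size = 0;
  void Add(Slot* s) {           // push at the head
    s->next = head;
    if (head == nullptr) tail = s;
    head = s;
    ++size;
  }
  void Merge(FreeList* other) { // O(1): just pointer swizzling
    if (other->size == 0) return;
    other->tail->next = head;   // old contents follow the merged-in ones
    if (head == nullptr) tail = other->tail;
    head = other->head;
    size += other->size;
    other->head = other->tail = nullptr;  // empty the source list
    other->size = 0;
  }
};

int main() {
  Slot a, b, c;
  FreeList free_list, thread_local_free_list;
  free_list.Add(&a);
  thread_local_free_list.Add(&b);
  thread_local_free_list.Add(&c);
  free_list.Merge(&thread_local_free_list);
  bool ok = free_list.size == 3 && thread_local_free_list.size == 0;
  return ok ? 0 : 1;
}
```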
diff --git a/runtime/gc/allocator/rosalloc.h b/runtime/gc/allocator/rosalloc.h
index a7f29af..87f1392 100644
--- a/runtime/gc/allocator/rosalloc.h
+++ b/runtime/gc/allocator/rosalloc.h
@@ -112,6 +112,198 @@
     DISALLOW_COPY_AND_ASSIGN(FreePageRun);
   };
 
+  // The slot header.
+  class Slot {
+   public:
+    Slot* Next() const {
+      return next_;
+    }
+    void SetNext(Slot* next) {
+      next_ = next;
+    }
+    // The slot right before this slot in terms of the address.
+    Slot* Left(size_t bracket_size) {
+      return reinterpret_cast<Slot*>(reinterpret_cast<uintptr_t>(this) - bracket_size);
+    }
+    void Clear() {
+      next_ = nullptr;
+    }
+
+   private:
+    Slot* next_;  // Next slot in the list.
+  };
+
+  // We use the tail (kUseTail == true) for the bulk or thread-local free lists to avoid the need to
+  // traverse the list from the head to the tail when merging free lists.
+  // We don't use the tail (kUseTail == false) for the free list to avoid having to manage the
+  // tail in the allocation fast path.
+  template<bool kUseTail = true>
+  class SlotFreeList {
+   public:
+    SlotFreeList() : head_(0U), tail_(0U), size_(0U) {}
+    Slot* Head() const {
+      return reinterpret_cast<Slot*>(head_);
+    }
+    Slot* Tail() const {
+      CHECK(kUseTail);
+      return reinterpret_cast<Slot*>(tail_);
+    }
+    size_t Size() const {
+      return size_;
+    }
+    // Removes from the head of the free list.
+    Slot* Remove() {
+      Slot* slot;
+      if (kIsDebugBuild) {
+        Verify();
+      }
+      Slot** headp = reinterpret_cast<Slot**>(&head_);
+      Slot** tailp = kUseTail ? reinterpret_cast<Slot**>(&tail_) : nullptr;
+      Slot* old_head = *headp;
+      if (old_head == nullptr) {
+        // List was empty.
+        if (kUseTail) {
+          DCHECK(*tailp == nullptr);
+        }
+        return nullptr;
+      } else {
+        // List wasn't empty.
+        if (kUseTail) {
+          DCHECK(*tailp != nullptr);
+        }
+        Slot* old_head_next = old_head->Next();
+        slot = old_head;
+        *headp = old_head_next;
+        if (kUseTail && old_head_next == nullptr) {
+          // List becomes empty.
+          *tailp = nullptr;
+        }
+      }
+      slot->Clear();
+      --size_;
+      if (kIsDebugBuild) {
+        Verify();
+      }
+      return slot;
+    }
+    void Add(Slot* slot) {
+      if (kIsDebugBuild) {
+        Verify();
+      }
+      DCHECK(slot != nullptr);
+      Slot** headp = reinterpret_cast<Slot**>(&head_);
+      Slot** tailp = kUseTail ? reinterpret_cast<Slot**>(&tail_) : nullptr;
+      Slot* old_head = *headp;
+      if (old_head == nullptr) {
+        // List was empty.
+        if (kUseTail) {
+          DCHECK(*tailp == nullptr);
+        }
+        *headp = slot;
+        if (kUseTail) {
+          *tailp = slot;
+        }
+      } else {
+        // List wasn't empty.
+        if (kUseTail) {
+          DCHECK(*tailp != nullptr);
+        }
+        *headp = slot;
+        slot->SetNext(old_head);
+      }
+      ++size_;
+      if (kIsDebugBuild) {
+        Verify();
+      }
+    }
+    // Merge the given list into this list. Empty the given list.
+    // Deliberately support only a kUseTail == true SlotFreeList parameter because 1) we don't
+    // currently have a situation where we need a kUseTail == false SlotFreeList parameter, and 2)
+    // supporting the kUseTail == false parameter would require an O(n) linked list traversal to do
+    // the merge if 'this' SlotFreeList has kUseTail == false, which we'd like to avoid.
+    void Merge(SlotFreeList<true>* list) {
+      if (kIsDebugBuild) {
+        Verify();
+        CHECK(list != nullptr);
+        list->Verify();
+      }
+      if (list->Size() == 0) {
+        return;
+      }
+      Slot** headp = reinterpret_cast<Slot**>(&head_);
+      Slot** tailp = kUseTail ? reinterpret_cast<Slot**>(&tail_) : nullptr;
+      Slot* old_head = *headp;
+      if (old_head == nullptr) {
+        // List was empty.
+        *headp = list->Head();
+        if (kUseTail) {
+          *tailp = list->Tail();
+        }
+        size_ = list->Size();
+      } else {
+        // List wasn't empty.
+        DCHECK(list->Head() != nullptr);
+        *headp = list->Head();
+        DCHECK(list->Tail() != nullptr);
+        list->Tail()->SetNext(old_head);
+        // If kUseTail, tailp needs no update here.
+        size_ += list->Size();
+      }
+      list->Reset();
+      if (kIsDebugBuild) {
+        Verify();
+      }
+    }
+
+    void Reset() {
+      head_ = 0;
+      if (kUseTail) {
+        tail_ = 0;
+      }
+      size_ = 0;
+    }
+
+    void Verify() {
+      Slot* head = reinterpret_cast<Slot*>(head_);
+      Slot* tail = kUseTail ? reinterpret_cast<Slot*>(tail_) : nullptr;
+      if (size_ == 0) {
+        CHECK(head == nullptr);
+        if (kUseTail) {
+          CHECK(tail == nullptr);
+        }
+      } else {
+        CHECK(head != nullptr);
+        if (kUseTail) {
+          CHECK(tail != nullptr);
+        }
+        size_t count = 0;
+        for (Slot* slot = head; slot != nullptr; slot = slot->Next()) {
+          ++count;
+          if (kUseTail && slot->Next() == nullptr) {
+            CHECK_EQ(slot, tail);
+          }
+        }
+        CHECK_EQ(size_, count);
+      }
+    }
+
+   private:
+    // A pointer (Slot*) to the head of the list. Always 8 bytes so that the layout is the same
+    // between 32 bit and 64 bit builds. This is not strictly necessary, but it buys us
+    // 1) uniformity, 2) no code change if we ever move to a non-low-4G heap in the future, and
+    // 3) the space saved by using 32 bit fields on 32 bit would be lost in the noise anyway
+    // (not enough to make an extra slot available).
+    uint64_t head_;
+    // A pointer (Slot*) to the tail of the list. Always 8 bytes so that we will have the same
+    // layout between 32 bit and 64 bit. The tail is stored to speed up merging of lists.
+    // Unused if kUseTail is false.
+    uint64_t tail_;
+    // The number of slots in the list. This makes it fast to check whether a run is all free
+    // (see IsAllFree()) without traversing the whole free list.
+    uint32_t size_;
+    uint32_t padding_ ATTRIBUTE_UNUSED;
+  };
+
   // Represents a run of memory slots of the same size.
   //
   // A run's memory layout:
@@ -125,19 +317,17 @@
   // +-------------------+
   // | to_be_bulk_freed  |
   // +-------------------+
-  // | top_bitmap_idx    |
-  // +-------------------+
   // |                   |
-  // | alloc bit map     |
+  // | free list         |
   // |                   |
   // +-------------------+
   // |                   |
-  // | bulk free bit map |
+  // | bulk free list    |
   // |                   |
   // +-------------------+
   // |                   |
   // | thread-local free |
-  // | bit map           |
+  // | list              |
   // |                   |
   // +-------------------+
   // | padding due to    |
@@ -160,94 +350,100 @@
     uint8_t size_bracket_idx_;          // The index of the size bracket of this run.
     uint8_t is_thread_local_;           // True if this run is used as a thread-local run.
     uint8_t to_be_bulk_freed_;          // Used within BulkFree() to flag a run that's involved with a bulk free.
-    uint32_t first_search_vec_idx_;  // The index of the first bitmap vector which may contain an available slot.
-    uint32_t alloc_bit_map_[0];      // The bit map that allocates if each slot is in use.
+    uint32_t padding_ ATTRIBUTE_UNUSED;
+    // Use a tailless free list for free_list_ so that the alloc fast path does not manage the tail.
+    SlotFreeList<false> free_list_;
+    SlotFreeList<true> bulk_free_list_;
+    SlotFreeList<true> thread_local_free_list_;
+    // Padding due to alignment
+    // Slot 0
+    // Slot 1
+    // ...
 
-    // bulk_free_bit_map_[] : The bit map that is used for GC to
-    // temporarily mark the slots to free without using a lock. After
-    // all the slots to be freed in a run are marked, all those slots
-    // get freed in bulk with one locking per run, as opposed to one
-    // locking per slot to minimize the lock contention. This is used
-    // within BulkFree().
-
-    // thread_local_free_bit_map_[] : The bit map that is used for GC
-    // to temporarily mark the slots to free in a thread-local run
-    // without using a lock (without synchronizing the thread that
-    // owns the thread-local run.) When the thread-local run becomes
-    // full, the thread will check this bit map and update the
-    // allocation bit map of the run (that is, the slots get freed.)
-
-    // Returns the byte size of the header except for the bit maps.
+    // Returns the byte size of the header.
     static size_t fixed_header_size() {
-      Run temp;
-      size_t size = reinterpret_cast<uint8_t*>(&temp.alloc_bit_map_) - reinterpret_cast<uint8_t*>(&temp);
-      DCHECK_EQ(size, static_cast<size_t>(8));
-      return size;
+      return sizeof(Run);
     }
-    // Returns the base address of the free bit map.
-    uint32_t* BulkFreeBitMap() {
-      return reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(this) + bulkFreeBitMapOffsets[size_bracket_idx_]);
+    Slot* FirstSlot() {
+      const uint8_t idx = size_bracket_idx_;
+      return reinterpret_cast<Slot*>(reinterpret_cast<uintptr_t>(this) + headerSizes[idx]);
     }
-    // Returns the base address of the thread local free bit map.
-    uint32_t* ThreadLocalFreeBitMap() {
-      return reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(this) + threadLocalFreeBitMapOffsets[size_bracket_idx_]);
+    Slot* LastSlot() {
+      const uint8_t idx = size_bracket_idx_;
+      const size_t bracket_size = bracketSizes[idx];
+      uintptr_t end = reinterpret_cast<uintptr_t>(End());
+      Slot* last_slot = reinterpret_cast<Slot*>(end - bracket_size);
+      DCHECK_LE(FirstSlot(), last_slot);
+      return last_slot;
+    }
+    SlotFreeList<false>* FreeList() {
+      return &free_list_;
+    }
+    SlotFreeList<true>* BulkFreeList() {
+      return &bulk_free_list_;
+    }
+    SlotFreeList<true>* ThreadLocalFreeList() {
+      return &thread_local_free_list_;
     }
     void* End() {
       return reinterpret_cast<uint8_t*>(this) + kPageSize * numOfPages[size_bracket_idx_];
     }
-    // Returns the number of bitmap words per run.
-    size_t NumberOfBitmapVectors() const {
-      return RoundUp(numOfSlots[size_bracket_idx_], 32) / 32;
-    }
     void SetIsThreadLocal(bool is_thread_local) {
       is_thread_local_  = is_thread_local ? 1 : 0;
     }
     bool IsThreadLocal() const {
       return is_thread_local_ != 0;
     }
-    // Frees slots in the allocation bit map with regard to the
-    // thread-local free bit map. Used when a thread-local run becomes
+    // Set up the free list for a new/empty run.
+    void InitFreeList() {
+      const uint8_t idx = size_bracket_idx_;
+      const size_t bracket_size = bracketSizes[idx];
+      Slot* first_slot = FirstSlot();
+      // Add backwards so the first slot is at the head of the list.
+      for (Slot* slot = LastSlot(); slot >= first_slot; slot = slot->Left(bracket_size)) {
+        free_list_.Add(slot);
+      }
+    }
+    // Merge the thread-local free list into the free list. Used when a thread-local run becomes
+    // full.
-    bool MergeThreadLocalFreeBitMapToAllocBitMap(bool* is_all_free_after_out);
-    // Frees slots in the allocation bit map with regard to the bulk
-    // free bit map. Used in a bulk free.
-    void MergeBulkFreeBitMapIntoAllocBitMap();
-    // Unions the slots to be freed in the free bit map into the
-    // thread-local free bit map. In a bulk free, as a two-step
-    // process, GC will first record all the slots to free in a run in
-    // the free bit map where it can write without a lock, and later
-    // acquire a lock once per run to union the bits of the free bit
-    // map to the thread-local free bit map.
-    void UnionBulkFreeBitMapToThreadLocalFreeBitMap();
+    bool MergeThreadLocalFreeListToFreeList(bool* is_all_free_after_out);
+    // Merge the bulk free list into the free list. Used in a bulk free.
+    void MergeBulkFreeListToFreeList();
+    // Merge the bulk free list into the thread-local free list. In a bulk free, as a two-step
+    // process, GC first records all the slots to free in a run in the bulk free list, where it
+    // can write without a lock, and later acquires a lock once per run to merge the bulk free
+    // list into the thread-local free list.
+    void MergeBulkFreeListToThreadLocalFreeList();
     // Allocates a slot in a run.
-    void* AllocSlot();
+    ALWAYS_INLINE void* AllocSlot();
     // Frees a slot in a run. This is used in a non-bulk free.
     void FreeSlot(void* ptr);
-    // Marks the slots to free in the bulk free bit map. Returns the bracket size.
-    size_t MarkBulkFreeBitMap(void* ptr);
-    // Marks the slots to free in the thread-local free bit map.
-    void MarkThreadLocalFreeBitMap(void* ptr);
-    // Last word mask, all of the bits in the last word which aren't valid slots are set to
-    // optimize allocation path.
-    static uint32_t GetBitmapLastVectorMask(size_t num_slots, size_t num_vec);
+    // Add the given slot to the bulk free list. Returns the bracket size.
+    size_t AddToBulkFreeList(void* ptr);
+    // Add the given slot to the thread-local free list.
+    void AddToThreadLocalFreeList(void* ptr);
     // Returns true if all the slots in the run are not in use.
-    bool IsAllFree();
+    bool IsAllFree() const {
+      return free_list_.Size() == numOfSlots[size_bracket_idx_];
+    }
     // Returns the number of free slots.
-    size_t NumberOfFreeSlots();
+    size_t NumberOfFreeSlots() {
+      return free_list_.Size();
+    }
     // Returns true if all the slots in the run are in use.
     ALWAYS_INLINE bool IsFull();
-    // Returns true if the bulk free bit map is clean.
-    bool IsBulkFreeBitmapClean();
-    // Returns true if the thread local free bit map is clean.
-    bool IsThreadLocalFreeBitmapClean();
-    // Set the alloc_bit_map_ bits for slots that are past the end of the run.
-    void SetAllocBitMapBitsForInvalidSlots();
+    // Returns true if the bulk free list is empty.
+    bool IsBulkFreeListEmpty() const {
+      return bulk_free_list_.Size() == 0;
+    }
+    // Returns true if the thread local free list is empty.
+    bool IsThreadLocalFreeListEmpty() const {
+      return thread_local_free_list_.Size() == 0;
+    }
     // Zero the run's data.
     void ZeroData();
-    // Zero the run's header.
-    void ZeroHeader();
-    // Fill the alloc bitmap with 1s.
-    void FillAllocBitMap();
+    // Zero the run's header and the slot headers.
+    void ZeroHeaderAndSlotHeaders();
     // Iterate over all the slots and apply the given function.
     void InspectAllSlots(void (*handler)(void* start, void* end, size_t used_bytes, void* callback_arg), void* arg);
     // Dump the run metadata for debugging.
@@ -258,11 +454,24 @@
         REQUIRES(Locks::thread_list_lock_);
 
    private:
-    // The common part of MarkFreeBitMap() and MarkThreadLocalFreeBitMap(). Returns the bracket
+    // The common part of AddToBulkFreeList() and AddToThreadLocalFreeList(). Returns the bracket
     // size.
-    size_t MarkFreeBitMapShared(void* ptr, uint32_t* free_bit_map_base, const char* caller_name);
-    // Turns the bit map into a string for debugging.
-    static std::string BitMapToStr(uint32_t* bit_map_base, size_t num_vec);
+    size_t AddToFreeListShared(void* ptr, SlotFreeList<true>* free_list, const char* caller_name);
+    // Turns a FreeList into a string for debugging.
+    template<bool kUseTail>
+    std::string FreeListToStr(SlotFreeList<kUseTail>* free_list);
+    // Check that a given pointer is a valid slot address and return it as a Slot*.
+    Slot* ToSlot(void* ptr) {
+      const uint8_t idx = size_bracket_idx_;
+      const size_t bracket_size = bracketSizes[idx];
+      const size_t offset_from_slot_base = reinterpret_cast<uint8_t*>(ptr)
+          - reinterpret_cast<uint8_t*>(FirstSlot());
+      DCHECK_EQ(offset_from_slot_base % bracket_size, static_cast<size_t>(0));
+      size_t slot_idx = offset_from_slot_base / bracket_size;
+      DCHECK_LT(slot_idx, numOfSlots[idx]);
+      return reinterpret_cast<Slot*>(ptr);
+    }
+    size_t SlotIndex(Slot* slot);
 
     // TODO: DISALLOW_COPY_AND_ASSIGN(Run);
   };
@@ -283,10 +492,6 @@
   static size_t numOfSlots[kNumOfSizeBrackets];
   // The header sizes in bytes of the runs for each size bracket.
   static size_t headerSizes[kNumOfSizeBrackets];
-  // The byte offsets of the bulk free bit maps of the runs for each size bracket.
-  static size_t bulkFreeBitMapOffsets[kNumOfSizeBrackets];
-  // The byte offsets of the thread-local free bit maps of the runs for each size bracket.
-  static size_t threadLocalFreeBitMapOffsets[kNumOfSizeBrackets];
 
   // Initialize the run specs (the above arrays).
   static void Initialize();
@@ -493,7 +698,7 @@
   // The reader-writer lock to allow one bulk free at a time while
   // allowing multiple individual frees at the same time. Also, this
   // is used to avoid race conditions between BulkFree() and
-  // RevokeThreadLocalRuns() on the bulk free bitmaps.
+  // RevokeThreadLocalRuns() on the bulk free list.
   ReaderWriterMutex bulk_free_lock_ DEFAULT_MUTEX_ACQUIRED_AFTER;
 
   // The page release mode.
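
Taken together, the header changes replace the per-run allocation bitmaps with intrusive singly-linked free lists. A standalone sketch of the idea (simplified, not the ART code; donor-reset semantics match Merge() above):

    #include <cstddef>

    struct Node { Node* next; };  // A free slot's first word doubles as 'next'.

    template <bool kUseTail>
    struct MiniFreeList {
      Node* head = nullptr;
      Node* tail = nullptr;  // Unused when kUseTail is false.
      size_t size = 0;

      Node* Remove() {  // Pop the head: the allocation fast path.
        Node* n = head;
        if (n == nullptr) return nullptr;
        head = n->next;
        if (kUseTail && head == nullptr) tail = nullptr;
        n->next = nullptr;
        --size;
        return n;
      }
      void Add(Node* n) {  // Push at the head: the free fast path.
        n->next = head;
        if (kUseTail && head == nullptr) tail = n;
        head = n;
        ++size;
      }
      // O(1) merge: splice the donor's chain in front of ours. Only a tailed
      // donor works, since we need its last node without a traversal.
      void Merge(MiniFreeList<true>* list) {
        if (list->size == 0) return;
        list->tail->next = head;
        if (kUseTail && head == nullptr) tail = list->tail;
        head = list->head;
        size += list->size;
        list->head = list->tail = nullptr;
        list->size = 0;
      }
    };

This mirrors the trade-off stated in the comments: only lists that get merged (bulk and thread-local) pay for a tail pointer, while the allocation-path list stays tailless.
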
diff --git a/runtime/jit/jit_instrumentation.cc b/runtime/jit/jit_instrumentation.cc
index 4f4a97f..e9c16c1 100644
--- a/runtime/jit/jit_instrumentation.cc
+++ b/runtime/jit/jit_instrumentation.cc
@@ -115,30 +115,8 @@
   }
 }
 
-class WaitForCompilationToFinishTask FINAL : public Task {
- public:
-  WaitForCompilationToFinishTask() : barrier_(0) {}
-
-  void Wait(Thread* self) {
-    barrier_.Increment(self, 1);
-  }
-
-  void Run(Thread* self ATTRIBUTE_UNUSED) OVERRIDE {}
-
-  void Finalize() OVERRIDE {
-    // Do this in Finalize since Finalize is called after Run by the thread pool.
-    barrier_.Pass(Thread::Current());
-  }
-
- private:
-  Barrier barrier_;
-  DISALLOW_COPY_AND_ASSIGN(WaitForCompilationToFinishTask);
-};
-
 void JitInstrumentationCache::WaitForCompilationToFinish(Thread* self) {
-  std::unique_ptr<WaitForCompilationToFinishTask> task(new WaitForCompilationToFinishTask);
-  thread_pool_->AddTask(self, task.get());
-  task->Wait(self);
+  thread_pool_->Wait(self, false, false);
 }
 
 }  // namespace jit
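
With the sentinel task gone, waiting reduces to draining the pool. A minimal sketch of that drain-wait pattern, assuming a pool that counts outstanding tasks (MiniPool and its members are illustrative, not the ART ThreadPool API):

    #include <condition_variable>
    #include <mutex>

    class MiniPool {
     public:
      void TaskQueued() { std::lock_guard<std::mutex> l(mu_); ++pending_; }
      void TaskFinished() {
        std::lock_guard<std::mutex> l(mu_);
        if (--pending_ == 0) cv_.notify_all();
      }
      void Wait() {  // Blocks until every queued task has completed.
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [this] { return pending_ == 0; });
      }
     private:
      std::mutex mu_;
      std::condition_variable cv_;
      int pending_ = 0;
    };
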
diff --git a/runtime/mirror/throwable.cc b/runtime/mirror/throwable.cc
index e8633de..e215994 100644
--- a/runtime/mirror/throwable.cc
+++ b/runtime/mirror/throwable.cc
@@ -71,18 +71,14 @@
 
 int32_t Throwable::GetStackDepth() {
   Object* stack_state = GetStackState();
-  if (stack_state == nullptr) {
+  if (stack_state == nullptr || !stack_state->IsObjectArray()) {
     return -1;
   }
-  if (!stack_state->IsIntArray() && !stack_state->IsLongArray()) {
-    return -1;
-  }
-  mirror::PointerArray* method_trace = down_cast<mirror::PointerArray*>(stack_state->AsArray());
-  int32_t array_len = method_trace->GetLength();
-  // The format is [method pointers][pcs] so the depth is half the length (see method
-  // BuildInternalStackTraceVisitor::Init).
-  CHECK_EQ(array_len % 2, 0);
-  return array_len / 2;
+  mirror::ObjectArray<mirror::Object>* const trace = stack_state->AsObjectArray<mirror::Object>();
+  const int32_t array_len = trace->GetLength();
+  DCHECK_GT(array_len, 0);
+  // See method BuildInternalStackTraceVisitor::Init for the format.
+  return array_len - 1;
 }
 
 std::string Throwable::Dump() {
@@ -95,18 +91,22 @@
   result += "\n";
   Object* stack_state = GetStackState();
   // check stack state isn't missing or corrupt
-  if (stack_state != nullptr &&
-      (stack_state->IsIntArray() || stack_state->IsLongArray())) {
+  if (stack_state != nullptr && stack_state->IsObjectArray()) {
+    mirror::ObjectArray<mirror::Object>* object_array =
+        stack_state->AsObjectArray<mirror::Object>();
     // Decode the internal stack trace into the depth and method trace
-    // Format is [method pointers][pcs]
-    auto* method_trace = down_cast<mirror::PointerArray*>(stack_state->AsArray());
-    auto array_len = method_trace->GetLength();
+    // See method BuildInternalStackTraceVisitor::Init for the format.
+    DCHECK_GT(object_array->GetLength(), 0);
+    mirror::Object* methods_and_dex_pcs = object_array->Get(0);
+    DCHECK(methods_and_dex_pcs->IsIntArray() || methods_and_dex_pcs->IsLongArray());
+    mirror::PointerArray* method_trace = down_cast<mirror::PointerArray*>(methods_and_dex_pcs);
+    const int32_t array_len = method_trace->GetLength();
     CHECK_EQ(array_len % 2, 0);
     const auto depth = array_len / 2;
     if (depth == 0) {
       result += "(Throwable with empty stack trace)";
     } else {
-      auto ptr_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
+      const size_t ptr_size = Runtime::Current()->GetClassLinker()->GetImagePointerSize();
       for (int32_t i = 0; i < depth; ++i) {
         ArtMethod* method = method_trace->GetElementPtrSize<ArtMethod*>(i, ptr_size);
         uintptr_t dex_pc = method_trace->GetElementPtrSize<uintptr_t>(i + depth, ptr_size);
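
For reference, the new stack-state layout is: element 0 of the object array is the packed [methods][dex pcs] pointer array, and elements 1..n hold declaring classes, so the depth is length - 1. A hedged sketch of the read side, using plain C++ containers in place of the mirror:: types:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct DecodedFrame { void* method; uint32_t dex_pc; };

    // methods_and_pcs is trace[0]: methods in the first half, pcs in the second.
    DecodedFrame FrameAt(const std::vector<void*>& methods_and_pcs, size_t i) {
      const size_t depth = methods_and_pcs.size() / 2;
      return DecodedFrame{
          methods_and_pcs[i],
          static_cast<uint32_t>(
              reinterpret_cast<uintptr_t>(methods_and_pcs[depth + i]))};
    }
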
diff --git a/runtime/runtime.cc b/runtime/runtime.cc
index 8cba1a9..1f447d0 100644
--- a/runtime/runtime.cc
+++ b/runtime/runtime.cc
@@ -1790,6 +1790,9 @@
 }
 
 LinearAlloc* Runtime::CreateLinearAlloc() {
+  // For 64 bit compilers, the allocation needs to be in the low 4GB when we are cross compiling
+  // for a 32 bit target. In that case, the dex cache arrays hold 32 bit pointers, which cannot
+  // hold 64 bit ArtMethod pointers that live above 4GB.
   return (IsAotCompiler() && Is64BitInstructionSet(kRuntimeISA))
       ? new LinearAlloc(low_4gb_arena_pool_.get())
       : new LinearAlloc(arena_pool_.get());
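
The constraint here is plain pointer-width truncation: a hedged illustration (not ART code) of why 32 bit dex cache array slots force the backing allocation into the low 4GB:

    #include <cassert>
    #include <cstdint>

    void StoreInNarrowSlot(uint32_t* slot, void* method) {
      uintptr_t addr = reinterpret_cast<uintptr_t>(method);
      assert(addr <= UINT32_MAX);  // Only holds if 'method' lives in the low 4GB.
      *slot = static_cast<uint32_t>(addr);  // Would silently truncate otherwise.
    }
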
diff --git a/runtime/thread.cc b/runtime/thread.cc
index 82e6fb0..65f71ef 100644
--- a/runtime/thread.cc
+++ b/runtime/thread.cc
@@ -1966,15 +1966,32 @@
         pointer_size_(Runtime::Current()->GetClassLinker()->GetImagePointerSize()) {}
 
   bool Init(int depth) SHARED_REQUIRES(Locks::mutator_lock_) ACQUIRE(Roles::uninterruptible_) {
-    // Allocate method trace with format [method pointers][pcs].
-    auto* cl = Runtime::Current()->GetClassLinker();
-    trace_ = cl->AllocPointerArray(self_, depth * 2);
-    const char* last_no_suspend_cause =
-        self_->StartAssertNoThreadSuspension("Building internal stack trace");
-    if (trace_ == nullptr) {
+    // Allocate the method trace as an object array where the first element is a pointer array
+    // that contains the ArtMethod pointers and dex PCs. The rest of the elements are the
+    // declaring classes of those methods.
+    ClassLinker* class_linker = Runtime::Current()->GetClassLinker();
+    StackHandleScope<1> hs(self_);
+    mirror::Class* array_class = class_linker->GetClassRoot(ClassLinker::kObjectArrayClass);
+    // The first element is the methods and dex pc array; the other elements are the declaring
+    // classes of the methods, to ensure classes in the stack trace don't get unloaded.
+    Handle<mirror::ObjectArray<mirror::Object>> trace(
+        hs.NewHandle(
+            mirror::ObjectArray<mirror::Object>::Alloc(hs.Self(), array_class, depth + 1)));
+    if (trace.Get() == nullptr) {
+      // Acquire uninterruptible_ in all paths.
+      self_->StartAssertNoThreadSuspension("Building internal stack trace");
       self_->AssertPendingOOMException();
       return false;
     }
+    mirror::PointerArray* methods_and_pcs = class_linker->AllocPointerArray(self_, depth * 2);
+    const char* last_no_suspend_cause =
+        self_->StartAssertNoThreadSuspension("Building internal stack trace");
+    if (methods_and_pcs == nullptr) {
+      self_->AssertPendingOOMException();
+      return false;
+    }
+    trace->Set(0, methods_and_pcs);
+    trace_ = trace.Get();
     // If we are called from native, use non-transactional mode.
     CHECK(last_no_suspend_cause == nullptr) << last_no_suspend_cause;
     return true;
@@ -1996,16 +2013,24 @@
     if (m->IsRuntimeMethod()) {
       return true;  // Ignore runtime frames (in particular callee save).
     }
-    trace_->SetElementPtrSize<kTransactionActive>(
-        count_, m, pointer_size_);
-    trace_->SetElementPtrSize<kTransactionActive>(
-        trace_->GetLength() / 2 + count_, m->IsProxyMethod() ? DexFile::kDexNoIndex : GetDexPc(),
-            pointer_size_);
+    mirror::PointerArray* trace_methods_and_pcs = GetTraceMethodsAndPCs();
+    trace_methods_and_pcs->SetElementPtrSize<kTransactionActive>(count_, m, pointer_size_);
+    trace_methods_and_pcs->SetElementPtrSize<kTransactionActive>(
+        trace_methods_and_pcs->GetLength() / 2 + count_,
+        m->IsProxyMethod() ? DexFile::kDexNoIndex : GetDexPc(),
+        pointer_size_);
+    // Save the declaring class of the method to ensure that the declaring classes of the methods
+    // do not get unloaded while the stack trace is live.
+    trace_->Set(count_ + 1, m->GetDeclaringClass());
     ++count_;
     return true;
   }
 
-  mirror::PointerArray* GetInternalStackTrace() const {
+  mirror::PointerArray* GetTraceMethodsAndPCs() const SHARED_REQUIRES(Locks::mutator_lock_) {
+    return down_cast<mirror::PointerArray*>(trace_->Get(0));
+  }
+
+  mirror::ObjectArray<mirror::Object>* GetInternalStackTrace() const {
     return trace_;
   }
 
@@ -2015,8 +2040,11 @@
   int32_t skip_depth_;
   // Current position down stack trace.
   uint32_t count_;
-  // An array of the methods on the stack, the last entries are the dex PCs.
-  mirror::PointerArray* trace_;
+  // An object array where the first element is a pointer array that contains the ArtMethod
+  // pointers on the stack and their dex PCs. The rest of the elements are the declaring
+  // classes of those methods: trace_[i + 1] contains the declaring class of the ArtMethod of
+  // the i'th frame.
+  mirror::ObjectArray<mirror::Object>* trace_;
   // For cross compilation.
   const size_t pointer_size_;
 
@@ -2039,11 +2067,12 @@
     return nullptr;  // Allocation failed.
   }
   build_trace_visitor.WalkStack();
-  mirror::PointerArray* trace = build_trace_visitor.GetInternalStackTrace();
+  mirror::ObjectArray<mirror::Object>* trace = build_trace_visitor.GetInternalStackTrace();
   if (kIsDebugBuild) {
-    // Second half is dex PCs.
-    for (uint32_t i = 0; i < static_cast<uint32_t>(trace->GetLength() / 2); ++i) {
-      auto* method = trace->GetElementPtrSize<ArtMethod*>(
+    mirror::PointerArray* trace_methods = build_trace_visitor.GetTraceMethodsAndPCs();
+    // Second half of trace_methods is dex PCs.
+    for (uint32_t i = 0; i < static_cast<uint32_t>(trace_methods->GetLength() / 2); ++i) {
+      auto* method = trace_methods->GetElementPtrSize<ArtMethod*>(
           i, Runtime::Current()->GetClassLinker()->GetImagePointerSize());
       CHECK(method != nullptr);
     }
@@ -2062,12 +2091,16 @@
 }
 
 jobjectArray Thread::InternalStackTraceToStackTraceElementArray(
-    const ScopedObjectAccessAlreadyRunnable& soa, jobject internal, jobjectArray output_array,
+    const ScopedObjectAccessAlreadyRunnable& soa,
+    jobject internal,
+    jobjectArray output_array,
     int* stack_depth) {
-  // Decode the internal stack trace into the depth, method trace and PC trace
-  int32_t depth = soa.Decode<mirror::PointerArray*>(internal)->GetLength() / 2;
+  // Decode the internal stack trace into the depth, method trace and PC trace.
+  // Subtract one for the methods and PC trace.
+  int32_t depth = soa.Decode<mirror::Array*>(internal)->GetLength() - 1;
+  DCHECK_GE(depth, 0);
 
-  auto* cl = Runtime::Current()->GetClassLinker();
+  ClassLinker* const class_linker = Runtime::Current()->GetClassLinker();
 
   jobjectArray result;
 
@@ -2081,7 +2114,7 @@
   } else {
     // Create java_trace array and place in local reference table
     mirror::ObjectArray<mirror::StackTraceElement>* java_traces =
-        cl->AllocStackTraceElementArray(soa.Self(), depth);
+        class_linker->AllocStackTraceElementArray(soa.Self(), depth);
     if (java_traces == nullptr) {
       return nullptr;
     }
@@ -2093,7 +2126,12 @@
   }
 
   for (int32_t i = 0; i < depth; ++i) {
-    auto* method_trace = soa.Decode<mirror::PointerArray*>(internal);
+    mirror::ObjectArray<mirror::Object>* decoded_traces =
+        soa.Decode<mirror::Object*>(internal)->AsObjectArray<mirror::Object>();
+    // Methods and dex PC trace is element 0.
+    DCHECK(decoded_traces->Get(0)->IsIntArray() || decoded_traces->Get(0)->IsLongArray());
+    mirror::PointerArray* const method_trace =
+        down_cast<mirror::PointerArray*>(decoded_traces->Get(0));
     // Prepare parameters for StackTraceElement(String cls, String method, String file, int line)
     ArtMethod* method = method_trace->GetElementPtrSize<ArtMethod*>(i, sizeof(void*));
     uint32_t dex_pc = method_trace->GetElementPtrSize<uint32_t>(
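
The write side uses the same layout as the read side in throwable.cc: frame i's method goes at index i of the inner pointer array, its dex pc at length / 2 + i, and its declaring class at trace_[i + 1]. A compact sketch with plain containers (illustrative, not the mirror:: API):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // methods_and_pcs has length 2 * depth; trace has length depth + 1 and
    // keeps declaring classes reachable so the GC cannot unload them.
    void RecordFrame(std::vector<void*>& trace,
                     std::vector<void*>& methods_and_pcs,
                     uint32_t count, void* method, uintptr_t dex_pc,
                     void* declaring_class) {
      const size_t half = methods_and_pcs.size() / 2;
      methods_and_pcs[count] = method;
      methods_and_pcs[half + count] = reinterpret_cast<void*>(dex_pc);
      trace[count + 1] = declaring_class;
    }
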
diff --git a/test/141-class-unload/expected.txt b/test/141-class-unload/expected.txt
index ff65a70..53d7abe 100644
--- a/test/141-class-unload/expected.txt
+++ b/test/141-class-unload/expected.txt
@@ -16,3 +16,8 @@
 JNI_OnLoad called
 JNI_OnUnload called
 null
+1
+2
+JNI_OnLoad called
+class null false test
+JNI_OnUnload called
diff --git a/test/141-class-unload/src-ex/IntHolder.java b/test/141-class-unload/src-ex/IntHolder.java
index e4aa6b8..feff0d2 100644
--- a/test/141-class-unload/src-ex/IntHolder.java
+++ b/test/141-class-unload/src-ex/IntHolder.java
@@ -36,4 +36,8 @@
     }
 
     public static native void waitForCompilation();
+
+    public static Throwable generateStackTrace() {
+      return new Exception("test");
+    }
 }
diff --git a/test/141-class-unload/src/Main.java b/test/141-class-unload/src/Main.java
index 105a2b9..3cc43ac 100644
--- a/test/141-class-unload/src/Main.java
+++ b/test/141-class-unload/src/Main.java
@@ -39,6 +39,8 @@
             testNoUnloadInstance(constructor);
             // Test JNI_OnLoad and JNI_OnUnload.
             testLoadAndUnloadLibrary(constructor);
+            // Test that stack traces keep the classes live.
+            testStackTrace(constructor);
             // Stress test to make sure we don't leak memory.
             stressTest(constructor);
         } catch (Exception e) {
@@ -75,6 +77,16 @@
       System.out.println(loader.get());
     }
 
+    private static void testStackTrace(Constructor constructor) throws Exception {
+        WeakReference<Class> klass = setUpUnloadClass(constructor);
+        Method stackTraceMethod = klass.get().getDeclaredMethod("generateStackTrace");
+        Throwable throwable = (Throwable) stackTraceMethod.invoke(klass.get());
+        stackTraceMethod = null;
+        Runtime.getRuntime().gc();
+        boolean isNull = klass.get() == null;
+        System.out.println("class null " + isNull + " " + throwable.getMessage());
+    }
+
     private static void testLoadAndUnloadLibrary(Constructor constructor) throws Exception {
         WeakReference<ClassLoader> loader = setUpLoadLibrary(constructor);
         // No strong references to the class loader, should get unloaded.
diff --git a/test/458-checker-instruction-simplification/src/Main.java b/test/458-checker-instruction-simplification/src/Main.java
index a14200e..c32d34a 100644
--- a/test/458-checker-instruction-simplification/src/Main.java
+++ b/test/458-checker-instruction-simplification/src/Main.java
@@ -84,6 +84,172 @@
     return arg & -1;
   }
 
+  /// CHECK-START: int Main.UShr28And15(int) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const28:i\d+>>  IntConstant 28
+  /// CHECK-DAG:     <<Const15:i\d+>>  IntConstant 15
+  /// CHECK-DAG:     <<UShr:i\d+>>     UShr [<<Arg>>,<<Const28>>]
+  /// CHECK-DAG:     <<And:i\d+>>      And [<<UShr>>,<<Const15>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: int Main.UShr28And15(int) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const28:i\d+>>  IntConstant 28
+  /// CHECK-DAG:     <<UShr:i\d+>>     UShr [<<Arg>>,<<Const28>>]
+  /// CHECK-DAG:                       Return [<<UShr>>]
+
+  /// CHECK-START: int Main.UShr28And15(int) instruction_simplifier (after)
+  /// CHECK-NOT:                       And
+
+  public static int UShr28And15(int arg) {
+    return (arg >>> 28) & 15;
+  }
+
+  /// CHECK-START: long Main.UShr60And15(long) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const60:i\d+>>  IntConstant 60
+  /// CHECK-DAG:     <<Const15:j\d+>>  LongConstant 15
+  /// CHECK-DAG:     <<UShr:j\d+>>     UShr [<<Arg>>,<<Const60>>]
+  /// CHECK-DAG:     <<And:j\d+>>      And [<<UShr>>,<<Const15>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: long Main.UShr60And15(long) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const60:i\d+>>  IntConstant 60
+  /// CHECK-DAG:     <<UShr:j\d+>>     UShr [<<Arg>>,<<Const60>>]
+  /// CHECK-DAG:                       Return [<<UShr>>]
+
+  /// CHECK-START: long Main.UShr60And15(long) instruction_simplifier (after)
+  /// CHECK-NOT:                       And
+
+  public static long UShr60And15(long arg) {
+    return (arg >>> 60) & 15;
+  }
+
+  /// CHECK-START: int Main.UShr28And7(int) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const28:i\d+>>  IntConstant 28
+  /// CHECK-DAG:     <<Const7:i\d+>>   IntConstant 7
+  /// CHECK-DAG:     <<UShr:i\d+>>     UShr [<<Arg>>,<<Const28>>]
+  /// CHECK-DAG:     <<And:i\d+>>      And [<<UShr>>,<<Const7>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: int Main.UShr28And7(int) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const28:i\d+>>  IntConstant 28
+  /// CHECK-DAG:     <<Const7:i\d+>>   IntConstant 7
+  /// CHECK-DAG:     <<UShr:i\d+>>     UShr [<<Arg>>,<<Const28>>]
+  /// CHECK-DAG:     <<And:i\d+>>      And [<<UShr>>,<<Const7>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  public static int UShr28And7(int arg) {
+    return (arg >>> 28) & 7;
+  }
+
+  /// CHECK-START: long Main.UShr60And7(long) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const60:i\d+>>  IntConstant 60
+  /// CHECK-DAG:     <<Const7:j\d+>>   LongConstant 7
+  /// CHECK-DAG:     <<UShr:j\d+>>     UShr [<<Arg>>,<<Const60>>]
+  /// CHECK-DAG:     <<And:j\d+>>      And [<<UShr>>,<<Const7>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: long Main.UShr60And7(long) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const60:i\d+>>  IntConstant 60
+  /// CHECK-DAG:     <<Const7:j\d+>>   LongConstant 7
+  /// CHECK-DAG:     <<UShr:j\d+>>     UShr [<<Arg>>,<<Const60>>]
+  /// CHECK-DAG:     <<And:j\d+>>      And [<<UShr>>,<<Const7>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  public static long UShr60And7(long arg) {
+    return (arg >>> 60) & 7;
+  }
+
+  /// CHECK-START: int Main.Shr24And255(int) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const24:i\d+>>  IntConstant 24
+  /// CHECK-DAG:     <<Const255:i\d+>> IntConstant 255
+  /// CHECK-DAG:     <<Shr:i\d+>>      Shr [<<Arg>>,<<Const24>>]
+  /// CHECK-DAG:     <<And:i\d+>>      And [<<Shr>>,<<Const255>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: int Main.Shr24And255(int) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const24:i\d+>>  IntConstant 24
+  /// CHECK-DAG:     <<UShr:i\d+>>     UShr [<<Arg>>,<<Const24>>]
+  /// CHECK-DAG:                       Return [<<UShr>>]
+
+  /// CHECK-START: int Main.Shr24And255(int) instruction_simplifier (after)
+  /// CHECK-NOT:                       Shr
+  /// CHECK-NOT:                       And
+
+  public static int Shr24And255(int arg) {
+    return (arg >> 24) & 255;
+  }
+
+  /// CHECK-START: long Main.Shr56And255(long) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const56:i\d+>>  IntConstant 56
+  /// CHECK-DAG:     <<Const255:j\d+>> LongConstant 255
+  /// CHECK-DAG:     <<Shr:j\d+>>      Shr [<<Arg>>,<<Const56>>]
+  /// CHECK-DAG:     <<And:j\d+>>      And [<<Shr>>,<<Const255>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: long Main.Shr56And255(long) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const56:i\d+>>  IntConstant 56
+  /// CHECK-DAG:     <<UShr:j\d+>>     UShr [<<Arg>>,<<Const56>>]
+  /// CHECK-DAG:                       Return [<<UShr>>]
+
+  /// CHECK-START: long Main.Shr56And255(long) instruction_simplifier (after)
+  /// CHECK-NOT:                       Shr
+  /// CHECK-NOT:                       And
+
+  public static long Shr56And255(long arg) {
+    return (arg >> 56) & 255;
+  }
+
+  /// CHECK-START: int Main.Shr24And127(int) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const24:i\d+>>  IntConstant 24
+  /// CHECK-DAG:     <<Const127:i\d+>> IntConstant 127
+  /// CHECK-DAG:     <<Shr:i\d+>>      Shr [<<Arg>>,<<Const24>>]
+  /// CHECK-DAG:     <<And:i\d+>>      And [<<Shr>>,<<Const127>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: int Main.Shr24And127(int) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:i\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const24:i\d+>>  IntConstant 24
+  /// CHECK-DAG:     <<Const127:i\d+>> IntConstant 127
+  /// CHECK-DAG:     <<Shr:i\d+>>      Shr [<<Arg>>,<<Const24>>]
+  /// CHECK-DAG:     <<And:i\d+>>      And [<<Shr>>,<<Const127>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  public static int Shr24And127(int arg) {
+    return (arg >> 24) & 127;
+  }
+
+  /// CHECK-START: long Main.Shr56And127(long) instruction_simplifier (before)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const56:i\d+>>  IntConstant 56
+  /// CHECK-DAG:     <<Const127:j\d+>> LongConstant 127
+  /// CHECK-DAG:     <<Shr:j\d+>>      Shr [<<Arg>>,<<Const56>>]
+  /// CHECK-DAG:     <<And:j\d+>>      And [<<Shr>>,<<Const127>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  /// CHECK-START: long Main.Shr56And127(long) instruction_simplifier (after)
+  /// CHECK-DAG:     <<Arg:j\d+>>      ParameterValue
+  /// CHECK-DAG:     <<Const56:i\d+>>  IntConstant 56
+  /// CHECK-DAG:     <<Const127:j\d+>> LongConstant 127
+  /// CHECK-DAG:     <<Shr:j\d+>>      Shr [<<Arg>>,<<Const56>>]
+  /// CHECK-DAG:     <<And:j\d+>>      And [<<Shr>>,<<Const127>>]
+  /// CHECK-DAG:                       Return [<<And>>]
+
+  public static long Shr56And127(long arg) {
+    return (arg >> 56) & 127;
+  }
+
   /// CHECK-START: long Main.Div1(long) instruction_simplifier (before)
   /// CHECK-DAG:     <<Arg:j\d+>>     ParameterValue
   /// CHECK-DAG:     <<Const1:j\d+>>  LongConstant 1
@@ -1109,5 +1275,13 @@
     assertFloatEquals(DivMP25(100.0f), -400.0f);
     assertDoubleEquals(DivMP25(150.0), -600.0);
     assertLongEquals(Shl1(100), 200);
+    assertIntEquals(UShr28And15(0xc1234567), 0xc);
+    assertLongEquals(UShr60And15(0xc123456787654321L), 0xcL);
+    assertIntEquals(UShr28And7(0xc1234567), 0x4);
+    assertLongEquals(UShr60And7(0xc123456787654321L), 0x4L);
+    assertIntEquals(Shr24And255(0xc1234567), 0xc1);
+    assertLongEquals(Shr56And255(0xc123456787654321L), 0xc1L);
+    assertIntEquals(Shr24And127(0xc1234567), 0x41);
+    assertLongEquals(Shr56And127(0xc123456787654321L), 0x41L);
   }
 }
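
The new checker tests exercise two simplifier rules: an And after a shift is redundant when its mask covers every bit the shift can leave set, and an arithmetic shift followed by such a mask can be rewritten as a single unsigned shift. A quick standalone check of those identities on the test constants:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t x = 0xc1234567u;
      // UShr28And15: >>> 28 leaves 4 bits; 15 masks 4 bits, so the And is a no-op.
      assert(((x >> 28) & 15u) == (x >> 28));
      // Shr24And255: the sign bits smeared in by >> 24 are cleared by & 255,
      // which is exactly what an unsigned >> 24 produces.
      assert(((static_cast<int32_t>(x) >> 24) & 255) ==
             static_cast<int32_t>(x >> 24));
      // UShr28And7 keeps its And: 7 does not cover all 4 result bits.
      return 0;
    }
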
diff --git a/test/466-get-live-vreg/get_live_vreg_jni.cc b/test/466-get-live-vreg/get_live_vreg_jni.cc
index e3e0091..7e9a583 100644
--- a/test/466-get-live-vreg/get_live_vreg_jni.cc
+++ b/test/466-get-live-vreg/get_live_vreg_jni.cc
@@ -42,7 +42,9 @@
     } else if (m_name.compare("testIntervalHole") == 0) {
       found_method_ = true;
       uint32_t value = 0;
-      if (GetCurrentQuickFrame() != nullptr && m->IsOptimized(sizeof(void*))) {
+      if (GetCurrentQuickFrame() != nullptr &&
+          m->IsOptimized(sizeof(void*)) &&
+          !Runtime::Current()->IsDebuggable()) {
         CHECK_EQ(GetVReg(m, 0, kIntVReg, &value), false);
       } else {
         CHECK(GetVReg(m, 0, kIntVReg, &value));
diff --git a/test/530-checker-loops/src/Main.java b/test/530-checker-loops/src/Main.java
index 1c5b5d6..58c92f1 100644
--- a/test/530-checker-loops/src/Main.java
+++ b/test/530-checker-loops/src/Main.java
@@ -22,7 +22,7 @@
   static int sResult;
 
   //
-  // Various sequence variables where bound checks can be removed from loop.
+  // Various sequence variables used in bounds checks.
   //
 
   /// CHECK-START: int Main.linear(int[]) BCE (before)
@@ -262,11 +262,11 @@
     return result;
   }
 
-  /// CHECK-START: int Main.linearForNE() BCE (before)
+  /// CHECK-START: int Main.linearForNEUp() BCE (before)
   /// CHECK-DAG: BoundsCheck
-  /// CHECK-START: int Main.linearForNE() BCE (after)
+  /// CHECK-START: int Main.linearForNEUp() BCE (after)
   /// CHECK-NOT: BoundsCheck
-  private static int linearForNE() {
+  private static int linearForNEUp() {
     int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
     int result = 0;
     for (int i = 0; i != 10; i++) {
@@ -275,21 +275,47 @@
     return result;
   }
 
-  /// CHECK-START: int Main.linearDoWhile() BCE (before)
+  /// CHECK-START: int Main.linearForNEDown() BCE (before)
   /// CHECK-DAG: BoundsCheck
-  /// CHECK-START: int Main.linearDoWhile() BCE (after)
+  /// CHECK-START: int Main.linearForNEDown() BCE (after)
+  /// CHECK-NOT: BoundsCheck
+  private static int linearForNEDown() {
+    int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    int result = 0;
+    for (int i = 9; i != -1; i--) {
+      result += x[i];
+    }
+    return result;
+  }
+
+  /// CHECK-START: int Main.linearDoWhileUp() BCE (before)
   /// CHECK-DAG: BoundsCheck
-  private static int linearDoWhile() {
+  /// CHECK-START: int Main.linearDoWhileUp() BCE (after)
+  /// CHECK-NOT: BoundsCheck
+  private static int linearDoWhileUp() {
     int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
     int result = 0;
     int i = 0;
-    // TODO: make this work
     do {
       result += x[i++];
     } while (i < 10);
     return result;
   }
 
+  /// CHECK-START: int Main.linearDoWhileDown() BCE (before)
+  /// CHECK-DAG: BoundsCheck
+  /// CHECK-START: int Main.linearDoWhileDown() BCE (after)
+  /// CHECK-NOT: BoundsCheck
+  private static int linearDoWhileDown() {
+    int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    int result = 0;
+    int i = 9;
+    do {
+      result += x[i--];
+    } while (0 <= i);
+    return result;
+  }
+
   /// CHECK-START: int Main.linearShort() BCE (before)
   /// CHECK-DAG: BoundsCheck
   /// CHECK-START: int Main.linearShort() BCE (after)
@@ -471,23 +497,50 @@
     return result;
   }
 
-  //
-  // Cases that actually go out of bounds. These test cases
-  // ensure the exceptions are thrown at the right places.
-  //
-
+  /// CHECK-START: void Main.lowerOOB(int[]) BCE (before)
+  /// CHECK-DAG: BoundsCheck
+  /// CHECK-START: void Main.lowerOOB(int[]) BCE (after)
+  /// CHECK-DAG: BoundsCheck
   private static void lowerOOB(int[] x) {
     for (int i = -1; i < x.length; i++) {
       sResult += x[i];
     }
   }
 
+  /// CHECK-START: void Main.upperOOB(int[]) BCE (before)
+  /// CHECK-DAG: BoundsCheck
+  /// CHECK-START: void Main.upperOOB(int[]) BCE (after)
+  /// CHECK-DAG: BoundsCheck
   private static void upperOOB(int[] x) {
     for (int i = 0; i <= x.length; i++) {
       sResult += x[i];
     }
   }
 
+  /// CHECK-START: void Main.doWhileUpOOB() BCE (before)
+  /// CHECK-DAG: BoundsCheck
+  /// CHECK-START: void Main.doWhileUpOOB() BCE (after)
+  /// CHECK-DAG: BoundsCheck
+  private static void doWhileUpOOB() {
+    int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    int i = 0;
+    do {
+      sResult += x[i++];
+    } while (i <= x.length);
+  }
+
+  /// CHECK-START: void Main.doWhileDownOOB() BCE (before)
+  /// CHECK-DAG: BoundsCheck
+  /// CHECK-START: void Main.doWhileDownOOB() BCE (after)
+  /// CHECK-DAG: BoundsCheck
+  private static void doWhileDownOOB() {
+    int[] x = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
+    int i = x.length - 1;
+    do {
+      sResult += x[i--];
+    } while (-1 <= i);
+  }
+
   //
   // Verifier.
   //
@@ -550,8 +603,10 @@
     expectEquals(66, linearWithVeryLargeNegativeStride());
 
     // Special forms.
-    expectEquals(55, linearForNE());
-    expectEquals(55, linearDoWhile());
+    expectEquals(55, linearForNEUp());
+    expectEquals(55, linearForNEDown());
+    expectEquals(55, linearDoWhileUp());
+    expectEquals(55, linearDoWhileDown());
     expectEquals(55, linearShort());
 
     // Periodic adds (1, 3), one at the time.
@@ -618,6 +673,23 @@
     }
     expectEquals(1055, sResult);
 
+    // Do while up goes OOB.
+    sResult = 0;
+    try {
+      doWhileUpOOB();
+    } catch (ArrayIndexOutOfBoundsException e) {
+      sResult += 1000;
+    }
+    expectEquals(1055, sResult);
+
+    // Do while down goes OOB.
+    sResult = 0;
+    try {
+      doWhileDownOOB();
+    } catch (ArrayIndexOutOfBoundsException e) {
+      sResult += 1000;
+    }
+    expectEquals(1055, sResult);
   }
 
   private static void expectEquals(int expected, int result) {
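
The new down-counting cases rely on the same monotonic range reasoning as the up-counting ones: in linearDoWhileDown, i starts at 9 and every later iteration is guarded by 0 <= i, so each access uses i in [0, 9] and the bounds check can be removed; in doWhileDownOOB the guard -1 <= i lets i reach -1, so the check must stay. A sketch of the provable invariant:

    #include <cassert>

    int SumDown(const int (&x)[10]) {
      int result = 0;
      int i = 9;
      do {
        assert(0 <= i && i <= 9);  // The invariant BCE proves; check removable.
        result += x[i--];
      } while (0 <= i);
      return result;
    }
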
diff --git a/test/532-checker-nonnull-arrayset/expected.txt b/test/532-checker-nonnull-arrayset/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/532-checker-nonnull-arrayset/expected.txt
diff --git a/test/532-checker-nonnull-arrayset/info.txt b/test/532-checker-nonnull-arrayset/info.txt
new file mode 100644
index 0000000..e1578c8
--- /dev/null
+++ b/test/532-checker-nonnull-arrayset/info.txt
@@ -0,0 +1 @@
+Test that we optimize ArraySet when the value is not null.
diff --git a/test/532-checker-nonnull-arrayset/src/Main.java b/test/532-checker-nonnull-arrayset/src/Main.java
new file mode 100644
index 0000000..7d8fff4
--- /dev/null
+++ b/test/532-checker-nonnull-arrayset/src/Main.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+
+  // Check that we don't put a null check in the card marking code.
+
+  /// CHECK-START: void Main.test() instruction_simplifier (before)
+  /// CHECK:          ArraySet value_can_be_null:true
+
+  /// CHECK-START: void Main.test() instruction_simplifier (after)
+  /// CHECK:          ArraySet value_can_be_null:false
+
+  /// CHECK-START-X86: void Main.test() disassembly (after)
+  /// CHECK:          ArraySet value_can_be_null:false
+  /// CHECK-NOT:      test
+  /// CHECK:          ReturnVoid
+  public static void test() {
+    Object[] array = new Object[1];
+    Object nonNull = array[0];
+    nonNull.getClass(); // Ensure nonNull has an implicit null check.
+    array[0] = nonNull;
+  }
+
+  public static void main(String[] args) {}
+}
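
The test targets the card-marking write barrier: a reference store normally guards the card mark with a null test, and value_can_be_null:false licenses dropping that test. A hedged sketch of the two shapes (toy card-table geometry, names not ART's):

    #include <cstdint>

    static uint8_t card_table[1 << 20];  // Toy size; 1KB cards.

    inline void MarkCard(const void* holder) {
      card_table[reinterpret_cast<uintptr_t>(holder) >> 10] = 1;  // Dirty.
    }

    void ArraySetMaybeNull(void** slot, void* array, void* value) {
      *slot = value;
      if (value != nullptr) {  // value_can_be_null:true keeps the test.
        MarkCard(array);
      }
    }

    void ArraySetNonNull(void** slot, void* array, void* value) {
      *slot = value;
      MarkCard(array);  // value_can_be_null:false: the null test is gone.
    }
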
diff --git a/test/533-regression-debugphi/expected.txt b/test/533-regression-debugphi/expected.txt
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/533-regression-debugphi/expected.txt
diff --git a/test/533-regression-debugphi/info.txt b/test/533-regression-debugphi/info.txt
new file mode 100644
index 0000000..a4d4857
--- /dev/null
+++ b/test/533-regression-debugphi/info.txt
@@ -0,0 +1,2 @@
+Test a regression where DeadPhiHandling would infinitely loop over
+complicated phi dependencies.
diff --git a/test/533-regression-debugphi/smali/TestCase.smali b/test/533-regression-debugphi/smali/TestCase.smali
new file mode 100644
index 0000000..1908e72
--- /dev/null
+++ b/test/533-regression-debugphi/smali/TestCase.smali
@@ -0,0 +1,72 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+.class public LTestCase;
+.super Ljava/lang/Object;
+
+# This is a reduced test case that used to trigger an infinite loop
+# in the DeadPhiHandling phase of the optimizing compiler (only used
+# with debuggable flag).
+.method public static testCase(IILjava/lang/Object;)V
+  .registers 5
+  const/4 v0, 0x0
+
+  :B4
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  goto :B7
+
+  :B7
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  if-nez p2, :Btmp
+  goto :B111
+
+  :Btmp
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  if-nez p2, :B9
+  goto :B110
+
+  :B13
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  add-int v0, p0, p1
+  goto :B7
+
+  :B110
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  add-int v0, p0, p1
+  goto :B111
+
+  :B111
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  goto :B4
+
+  :B9
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  if-nez p2, :B10
+
+  :B11
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  move v1, v0
+  goto :B12
+
+  :B10
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  move-object v1, p2
+  goto :B12
+
+  :B12
+  invoke-static {}, Ljava/lang/System;->nanoTime()J
+  goto :B13
+
+  return-void
+.end method
diff --git a/test/533-regression-debugphi/src/Main.java b/test/533-regression-debugphi/src/Main.java
new file mode 100644
index 0000000..858770f
--- /dev/null
+++ b/test/533-regression-debugphi/src/Main.java
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+  // Workaround for b/18051191.
+  class InnerClass {}
+
+  public static void main(String[] args) {}
+}
diff --git a/test/534-checker-bce-deoptimization/expected.txt b/test/534-checker-bce-deoptimization/expected.txt
new file mode 100644
index 0000000..3823a29
--- /dev/null
+++ b/test/534-checker-bce-deoptimization/expected.txt
@@ -0,0 +1 @@
+finish
diff --git a/test/534-checker-bce-deoptimization/info.txt b/test/534-checker-bce-deoptimization/info.txt
new file mode 100644
index 0000000..9f097d0
--- /dev/null
+++ b/test/534-checker-bce-deoptimization/info.txt
@@ -0,0 +1,8 @@
+Checker test for testing the behavior of deoptimization generated by
+bounds check elimination.
+
+The runtime used to trip on this test because it deoptimized the
+whole stack, while the compiler was not preserving dex registers at
+call sites.
+
+We fixed the bug by doing single frame deoptimization.
diff --git a/test/534-checker-bce-deoptimization/src/Main.java b/test/534-checker-bce-deoptimization/src/Main.java
new file mode 100644
index 0000000..8cd20f6
--- /dev/null
+++ b/test/534-checker-bce-deoptimization/src/Main.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2015 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+public class Main {
+    public static void main(String[] args) {
+        new Main().run();
+        System.out.println("finish");
+    }
+
+    public void run() {
+        double a[][] = new double[200][201];
+        double b[] = new double[200];
+        int n = 100;
+
+        foo1(a, n, b);
+    }
+
+    void foo1(double a[][], int n, double b[]) {
+        double t;
+        int i,k;
+
+        for (i = 0; i < n; i++) {
+            k = n - (i + 1);
+            b[k] /= a[k][k];
+            t = -b[k];
+            foo2(k + 1000, t, b);
+        }
+    }
+
+    void foo2(int n, double c, double b[]) {
+        try {
+            foo3(n, c, b);
+        } catch (Exception e) {
+        }
+    }
+
+    void foo3(int n, double c, double b[]) {
+        int i = 0;
+        for (i = 0; i < n; i++) {
+            b[i + 1] += c * b[i + 1];
+        }
+    }
+}
+
diff --git a/test/Android.run-test.mk b/test/Android.run-test.mk
index 4397ea4..e915357 100644
--- a/test/Android.run-test.mk
+++ b/test/Android.run-test.mk
@@ -418,19 +418,70 @@
 
 TEST_ART_BROKEN_DEFAULT_RUN_TESTS :=
 
-# Known broken tests for the arm64 optimizing compiler backend.
-TEST_ART_BROKEN_OPTIMIZING_ARM64_RUN_TESTS :=
+# Known broken tests for the mips32 optimizing compiler backend.
+TEST_ART_BROKEN_OPTIMIZING_MIPS_RUN_TESTS := \
+    441-checker-inliner \
+    442-checker-constant-folding \
+    444-checker-nce \
+    445-checker-licm \
+    446-checker-inliner2 \
+    447-checker-inliner3 \
+    449-checker-bce \
+    450-checker-types \
+    455-checker-gvn \
+    458-checker-instruction-simplification \
+    462-checker-inlining-across-dex-files \
+    463-checker-boolean-simplifier \
+    464-checker-inline-sharpen-calls \
+    465-checker-clinit-gvn \
+    468-checker-bool-simplifier-regression \
+    473-checker-inliner-constants \
+    474-checker-boolean-input \
+    476-checker-ctor-memory-barrier \
+    477-checker-bound-type \
+    478-checker-clinit-check-pruning \
+    478-checker-inliner-nested-loop \
+    480-checker-dead-blocks \
+    482-checker-loop-back-edge-use \
+    484-checker-register-hints \
+    485-checker-dce-loop-update \
+    485-checker-dce-switch \
+    486-checker-must-do-null-check \
+    487-checker-inline-calls \
+    488-checker-inline-recursive-calls \
+    490-checker-inline \
+    492-checker-inline-invoke-interface \
+    493-checker-inline-invoke-interface \
+    494-checker-instanceof-tests \
+    495-checker-checkcast-tests \
+    496-checker-inlining-and-class-loader \
+    508-checker-disassembly \
+    510-checker-try-catch \
+    517-checker-builder-fallthrough \
+    521-checker-array-set-null \
+    522-checker-regression-monitor-exit \
+    523-checker-can-throw-regression \
+    525-checker-arrays-and-fields \
+    526-checker-caller-callee-regs \
+    529-checker-unresolved \
+    530-checker-loops \
+    530-checker-regression-reftype-final \
+    532-checker-nonnull-arrayset \
 
-ifneq (,$(filter optimizing,$(COMPILER_TYPES)))
-  ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,target,$(RUN_TYPES),$(PREBUILD_TYPES), \
-      optimizing,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
-      $(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES),$(TEST_ART_BROKEN_OPTIMIZING_ARM64_RUN_TESTS),64)
+ifeq (mips,$(TARGET_ARCH))
+  ifneq (,$(filter optimizing,$(COMPILER_TYPES)))
+    ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,target,$(RUN_TYPES),$(PREBUILD_TYPES), \
+        optimizing,$(RELOCATE_TYPES),$(TRACE_TYPES),$(GC_TYPES),$(JNI_TYPES), \
+        $(IMAGE_TYPES),$(PICTEST_TYPES),$(DEBUGGABLE_TYPES), \
+        $(TEST_ART_BROKEN_OPTIMIZING_MIPS_RUN_TESTS),$(ALL_ADDRESS_SIZES))
+  endif
 endif
 
-TEST_ART_BROKEN_OPTIMIZING_ARM64_RUN_TESTS :=
+TEST_ART_BROKEN_OPTIMIZING_MIPS_RUN_TESTS :=
 
 # Known broken tests for the optimizing compiler.
-TEST_ART_BROKEN_OPTIMIZING_RUN_TESTS :=
+TEST_ART_BROKEN_OPTIMIZING_RUN_TESTS := \
+  455-set-vreg \
 
 ifneq (,$(filter optimizing,$(COMPILER_TYPES)))
   ART_TEST_KNOWN_BROKEN += $(call all-run-test-names,$(TARGET_TYPES),$(RUN_TYPES),$(PREBUILD_TYPES), \
@@ -451,7 +502,6 @@
 # Tests that should fail when the optimizing compiler compiles them non-debuggable.
 TEST_ART_BROKEN_OPTIMIZING_NONDEBUGGABLE_RUN_TESTS := \
   454-get-vreg \
-  455-set-vreg \
   457-regs \
 
 ifneq (,$(filter optimizing,$(COMPILER_TYPES)))
diff --git a/tools/run-libcore-tests.sh b/tools/run-libcore-tests.sh
index 26d9ad7..80f7a37 100755
--- a/tools/run-libcore-tests.sh
+++ b/tools/run-libcore-tests.sh
@@ -86,12 +86,6 @@
     # Remove the --debug from the arguments.
     vogar_args=${vogar_args/$1}
     vogar_args="$vogar_args --vm-arg -XXlib:libartd.so"
-    if [ "$emulator" = "no" ]; then
-      # Increase the timeout, as vogar cannot set individual test
-      # timeout when being asked to run packages, and some tests go above
-      # the default timeout.
-      vogar_args="$vogar_args --timeout 240"
-    fi
     shift
   elif [[ "$1" == "" ]]; then
     break
@@ -100,10 +94,10 @@
   fi
 done
 
-if [ "$emulator" = "yes" ]; then
-  # Be very patient with the emulator.
-  vogar_args="$vogar_args --timeout 480"
-fi
+# Increase the timeout, as vogar cannot set individual test
+# timeout when being asked to run packages, and some tests go above
+# the default timeout.
+vogar_args="$vogar_args --timeout 480"
 
 # Run the tests using vogar.
 echo "Running tests for the following test packages:"